diff --git a/models/cv/classification/efficientnet_v2/ixrt/README.md b/models/cv/classification/efficientnet_v2/ixrt/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..3ca2d3b70b4515f3c3f874fff941ac48c37ac237
--- /dev/null
+++ b/models/cv/classification/efficientnet_v2/ixrt/README.md
@@ -0,0 +1,73 @@
+# EfficientnetV2
+
+## Description
+EfficientNetV2 is an improved version of the EfficientNet architecture proposed by Google, designed to deliver higher accuracy with better training and inference efficiency. Compared with the original EfficientNet, it simplifies the architecture and adds a series of enhancements that further boost performance.
+
+## Setup
+
+### Install
+```bash
+yum install mesa-libGL
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install tabulate
+pip3 install timm
+pip3 install ppq
+pip3 install pycuda
+pip3 install protobuf==3.20.0
+```
+
+### Download
+Pretrained model: <https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_t_agc-3620981a.pth> (also fetched automatically in the conversion step below)
+
+Dataset: the ImageNet validation set is required for accuracy evaluation; download it and point `DATASETS_DIR` (see Inference below) to the directory containing the validation images.
+
+### Model Conversion
+```bash
+mkdir checkpoints
+git clone https://github.com/huggingface/pytorch-image-models.git
+# replace the stock timm files with the patched copies shipped in this directory
+cp /Path/to/ixrt/export_onnx.py pytorch-image-models/timm/models
+rm pytorch-image-models/timm/models/_builder.py
+cp /Path/to/ixrt/_builder.py pytorch-image-models/timm/models
+# pre-download the pretrained weights into the torch hub cache
+mkdir -p /root/.cache/torch/hub/checkpoints/
+wget -P /root/.cache/torch/hub/checkpoints/ https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_t_agc-3620981a.pth
+cd pytorch-image-models/timm
+python3 -m models.export_onnx --output_model ../../checkpoints/efficientnet.onnx
+cd ../../
+```
+
+## Inference
+```bash
+export PROJ_DIR=/Path/to/efficientnet_v2/ixrt
+export DATASETS_DIR=/path/to/imagenet_val/
+export CHECKPOINTS_DIR=./checkpoints
+export RUN_DIR=/Path/to/efficientnet_v2/ixrt
+export CONFIG_DIR=/Path/to/config/EFFICIENTNET_V2T_CONFIG
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+```
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_efficientnet_fp16_accuracy.sh
+# Performance
+bash scripts/infer_efficientnet_fp16_performance.sh
+```
+
+### INT8
+```bash
+# Accuracy
+bash scripts/infer_efficientnet_int8_accuracy.sh
+# Performance
+bash scripts/infer_efficientnet_int8_performance.sh
+```
+
+
+## Results
+
+Model          | BatchSize | Precision | FPS     | Top-1(%) | Top-5(%)
+---------------|-----------|-----------|---------|----------|---------
+EfficientnetV2 | 32        | FP16      | 1882.87 | 82.14    | 96.16
+EfficientnetV2 | 32        | INT8      | 2595.96 | 81.50    | 95.96
diff --git a/models/cv/classification/efficientnet_v2/ixrt/_builder.py b/models/cv/classification/efficientnet_v2/ixrt/_builder.py
new file mode 100755
index 0000000000000000000000000000000000000000..7246c0d57abaffac744e861b959b9169121265c3
--- /dev/null
+++ b/models/cv/classification/efficientnet_v2/ixrt/_builder.py
@@ -0,0 +1,480 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
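+
+# NOTE: this is a locally adapted copy of timm/models/_builder.py. It replaces the stock
+# file before exporting the ONNX model (see the README's Model Conversion step); the
+# relevant change is in load_pretrained(), which is pinned to the EfficientNetV2-T release
+# checkpoint used by this sample.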
+ +import dataclasses +import logging +import os +from copy import deepcopy +from typing import Any, Callable, Dict, List, Optional, Tuple + +from torch import nn as nn +from torch.hub import load_state_dict_from_url + +from timm.models._features import FeatureListNet, FeatureDictNet, FeatureHookNet, FeatureGetterNet +from timm.models._features_fx import FeatureGraphNet +from timm.models._helpers import load_state_dict +from timm.models._hub import has_hf_hub, download_cached_file, check_cached_file, load_state_dict_from_hf,\ + load_custom_from_hf +from timm.models._manipulate import adapt_input_conv +from timm.models._pretrained import PretrainedCfg +from timm.models._prune import adapt_model_from_file +from timm.models._registry import get_pretrained_cfg + +_logger = logging.getLogger(__name__) + +# Global variables for rarely used pretrained checkpoint download progress and hash check. +# Use set_pretrained_download_progress / set_pretrained_check_hash functions to toggle. +_DOWNLOAD_PROGRESS = False +_CHECK_HASH = False +_USE_OLD_CACHE = int(os.environ.get('TIMM_USE_OLD_CACHE', 0)) > 0 + +__all__ = ['set_pretrained_download_progress', 'set_pretrained_check_hash', 'load_custom_pretrained', 'load_pretrained', + 'pretrained_cfg_for_features', 'resolve_pretrained_cfg', 'build_model_with_cfg'] + + +def _resolve_pretrained_source(pretrained_cfg): + cfg_source = pretrained_cfg.get('source', '') + pretrained_url = pretrained_cfg.get('url', None) + pretrained_file = pretrained_cfg.get('file', None) + pretrained_sd = pretrained_cfg.get('state_dict', None) + hf_hub_id = pretrained_cfg.get('hf_hub_id', None) + + # resolve where to load pretrained weights from + load_from = '' + pretrained_loc = '' + if cfg_source == 'hf-hub' and has_hf_hub(necessary=True): + # hf-hub specified as source via model identifier + load_from = 'hf-hub' + assert hf_hub_id + pretrained_loc = hf_hub_id + else: + # default source == timm or unspecified + if pretrained_sd: + # direct state_dict pass through is the highest priority + load_from = 'state_dict' + pretrained_loc = pretrained_sd + assert isinstance(pretrained_loc, dict) + elif pretrained_file: + # file load override is the second-highest priority if set + load_from = 'file' + pretrained_loc = pretrained_file + else: + old_cache_valid = False + if _USE_OLD_CACHE: + # prioritized old cached weights if exists and env var enabled + old_cache_valid = check_cached_file(pretrained_url) if pretrained_url else False + if not old_cache_valid and hf_hub_id and has_hf_hub(necessary=True): + # hf-hub available as alternate weight source in default_cfg + load_from = 'hf-hub' + pretrained_loc = hf_hub_id + elif pretrained_url: + load_from = 'url' + pretrained_loc = pretrained_url + + if load_from == 'hf-hub' and pretrained_cfg.get('hf_hub_filename', None): + # if a filename override is set, return tuple for location w/ (hub_id, filename) + pretrained_loc = pretrained_loc, pretrained_cfg['hf_hub_filename'] + return load_from, pretrained_loc + + +def set_pretrained_download_progress(enable=True): + """ Set download progress for pretrained weights on/off (globally). """ + global _DOWNLOAD_PROGRESS + _DOWNLOAD_PROGRESS = enable + + +def set_pretrained_check_hash(enable=True): + """ Set hash checking for pretrained weights on/off (globally). 
""" + global _CHECK_HASH + _CHECK_HASH = enable + + +def load_custom_pretrained( + model: nn.Module, + pretrained_cfg: Optional[Dict] = None, + load_fn: Optional[Callable] = None, +): + r"""Loads a custom (read non .pth) weight file + + Downloads checkpoint file into cache-dir like torch.hub based loaders, but calls + a passed in custom load fun, or the `load_pretrained` model member fn. + + If the object is already present in `model_dir`, it's deserialized and returned. + The default value of `model_dir` is ``/checkpoints`` where + `hub_dir` is the directory returned by :func:`~torch.hub.get_dir`. + + Args: + model: The instantiated model to load weights into + pretrained_cfg (dict): Default pretrained model cfg + load_fn: An external standalone fn that loads weights into provided model, otherwise a fn named + 'laod_pretrained' on the model will be called if it exists + """ + pretrained_cfg = pretrained_cfg or getattr(model, 'pretrained_cfg', None) + if not pretrained_cfg: + _logger.warning("Invalid pretrained config, cannot load weights.") + return + + load_from, pretrained_loc = _resolve_pretrained_source(pretrained_cfg) + if not load_from: + _logger.warning("No pretrained weights exist for this model. Using random initialization.") + return + if load_from == 'hf-hub': + _logger.warning("Hugging Face hub not currently supported for custom load pretrained models.") + elif load_from == 'url': + pretrained_loc = download_cached_file( + pretrained_loc, + check_hash=_CHECK_HASH, + progress=_DOWNLOAD_PROGRESS, + ) + + if load_fn is not None: + load_fn(model, pretrained_loc) + elif hasattr(model, 'load_pretrained'): + model.load_pretrained(pretrained_loc) + else: + _logger.warning("Valid function to load pretrained weights is not available, using random initialization.") + + +def load_pretrained( + model: nn.Module, + pretrained_cfg: Optional[Dict] = None, + num_classes: int = 1000, + in_chans: int = 3, + filter_fn: Optional[Callable] = None, + strict: bool = True, +): + """ Load pretrained checkpoint + + Args: + model (nn.Module) : PyTorch model module + pretrained_cfg (Optional[Dict]): configuration for pretrained weights / target dataset + num_classes (int): num_classes for target model + in_chans (int): in_chans for target model + filter_fn (Optional[Callable]): state_dict filter fn for load (takes state_dict, model as args) + strict (bool): strict load of checkpoint + + """ + pretrained_cfg = pretrained_cfg or getattr(model, 'pretrained_cfg', None) + if not pretrained_cfg: + raise RuntimeError("Invalid pretrained config, cannot load weights. 
Use `pretrained=False` for random init.")
+
+    load_from, pretrained_loc = _resolve_pretrained_source(pretrained_cfg)
+    # Local modification: always load the EfficientNetV2-T release checkpoint, either from
+    # the URL below or from the torch hub cache pre-populated as described in the README.
+    load_from = 'url'
+    pretrained_loc = 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_t_agc-3620981a.pth'
+    if load_from == 'state_dict':
+        _logger.info(f'Loading pretrained weights from state dict')
+        state_dict = pretrained_loc  # pretrained_loc is the actual state dict for this override
+    elif load_from == 'file':
+        _logger.info(f'Loading pretrained weights from file ({pretrained_loc})')
+        if pretrained_cfg.get('custom_load', False):
+            model.load_pretrained(pretrained_loc)
+            return
+        else:
+            state_dict = load_state_dict(pretrained_loc)
+    elif load_from == 'url':
+        _logger.info(f'Loading pretrained weights from url ({pretrained_loc})')
+        if pretrained_cfg.get('custom_load', False):
+            pretrained_loc = download_cached_file(
+                pretrained_loc,
+                progress=_DOWNLOAD_PROGRESS,
+                check_hash=_CHECK_HASH,
+            )
+            # load from the cached download rather than a hard-coded local path
+            model.load_pretrained(pretrained_loc)
+            return
+        else:
+            state_dict = load_state_dict_from_url(
+                pretrained_loc,
+                map_location='cpu',
+                progress=_DOWNLOAD_PROGRESS,
+                check_hash=_CHECK_HASH,
+            )
+    elif load_from == 'hf-hub':
+        _logger.info(f'Loading pretrained weights from Hugging Face hub ({pretrained_loc})')
+        if isinstance(pretrained_loc, (list, tuple)):
+            custom_load = pretrained_cfg.get('custom_load', False)
+            if isinstance(custom_load, str) and custom_load == 'hf':
+                load_custom_from_hf(*pretrained_loc, model)
+                return
+            else:
+                state_dict = load_state_dict_from_hf(*pretrained_loc)
+        else:
+            state_dict = load_state_dict_from_hf(pretrained_loc)
+    else:
+        model_name = pretrained_cfg.get('architecture', 'this model')
+        raise RuntimeError(f"No pretrained weights exist for {model_name}.
Use `pretrained=False` for random init.") + + if filter_fn is not None: + try: + state_dict = filter_fn(state_dict, model) + except TypeError as e: + # for backwards compat with filter fn that take one arg + state_dict = filter_fn(state_dict) + + input_convs = pretrained_cfg.get('first_conv', None) + if input_convs is not None and in_chans != 3: + if isinstance(input_convs, str): + input_convs = (input_convs,) + for input_conv_name in input_convs: + weight_name = input_conv_name + '.weight' + try: + state_dict[weight_name] = adapt_input_conv(in_chans, state_dict[weight_name]) + _logger.info( + f'Converted input conv {input_conv_name} pretrained weights from 3 to {in_chans} channel(s)') + except NotImplementedError as e: + del state_dict[weight_name] + strict = False + _logger.warning( + f'Unable to convert pretrained {input_conv_name} weights, using random init for this layer.') + + classifiers = pretrained_cfg.get('classifier', None) + label_offset = pretrained_cfg.get('label_offset', 0) + if classifiers is not None: + if isinstance(classifiers, str): + classifiers = (classifiers,) + if num_classes != pretrained_cfg['num_classes']: + for classifier_name in classifiers: + # completely discard fully connected if model num_classes doesn't match pretrained weights + state_dict.pop(classifier_name + '.weight', None) + state_dict.pop(classifier_name + '.bias', None) + strict = False + elif label_offset > 0: + for classifier_name in classifiers: + # special case for pretrained weights with an extra background class in pretrained weights + classifier_weight = state_dict[classifier_name + '.weight'] + state_dict[classifier_name + '.weight'] = classifier_weight[label_offset:] + classifier_bias = state_dict[classifier_name + '.bias'] + state_dict[classifier_name + '.bias'] = classifier_bias[label_offset:] + + load_result = model.load_state_dict(state_dict, strict=strict) + if load_result.missing_keys: + _logger.info( + f'Missing keys ({", ".join(load_result.missing_keys)}) discovered while loading pretrained weights.' + f' This is expected if model is being adapted.') + if load_result.unexpected_keys: + _logger.warning( + f'Unexpected keys ({", ".join(load_result.unexpected_keys)}) found while loading pretrained weights.' + f' This may be expected if model is being adapted.') + + +def pretrained_cfg_for_features(pretrained_cfg): + pretrained_cfg = deepcopy(pretrained_cfg) + # remove default pretrained cfg fields that don't have much relevance for feature backbone + to_remove = ('num_classes', 'classifier', 'global_pool') # add default final pool size? 
+ for tr in to_remove: + pretrained_cfg.pop(tr, None) + return pretrained_cfg + + +def _filter_kwargs(kwargs, names): + if not kwargs or not names: + return + for n in names: + kwargs.pop(n, None) + + +def _update_default_model_kwargs(pretrained_cfg, kwargs, kwargs_filter): + """ Update the default_cfg and kwargs before passing to model + + Args: + pretrained_cfg: input pretrained cfg (updated in-place) + kwargs: keyword args passed to model build fn (updated in-place) + kwargs_filter: keyword arg keys that must be removed before model __init__ + """ + # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs) + default_kwarg_names = ('num_classes', 'global_pool', 'in_chans') + if pretrained_cfg.get('fixed_input_size', False): + # if fixed_input_size exists and is True, model takes an img_size arg that fixes its input size + default_kwarg_names += ('img_size',) + + for n in default_kwarg_names: + # for legacy reasons, model __init__args uses img_size + in_chans as separate args while + # pretrained_cfg has one input_size=(C, H ,W) entry + if n == 'img_size': + input_size = pretrained_cfg.get('input_size', None) + if input_size is not None: + assert len(input_size) == 3 + kwargs.setdefault(n, input_size[-2:]) + elif n == 'in_chans': + input_size = pretrained_cfg.get('input_size', None) + if input_size is not None: + assert len(input_size) == 3 + kwargs.setdefault(n, input_size[0]) + elif n == 'num_classes': + default_val = pretrained_cfg.get(n, None) + # if default is < 0, don't pass through to model + if default_val is not None and default_val >= 0: + kwargs.setdefault(n, pretrained_cfg[n]) + else: + default_val = pretrained_cfg.get(n, None) + if default_val is not None: + kwargs.setdefault(n, pretrained_cfg[n]) + + # Filter keyword args for task specific model variants (some 'features only' models, etc.) + _filter_kwargs(kwargs, names=kwargs_filter) + + +def resolve_pretrained_cfg( + variant: str, + pretrained_cfg=None, + pretrained_cfg_overlay=None, +) -> PretrainedCfg: + model_with_tag = variant + pretrained_tag = None + if pretrained_cfg: + if isinstance(pretrained_cfg, dict): + # pretrained_cfg dict passed as arg, validate by converting to PretrainedCfg + pretrained_cfg = PretrainedCfg(**pretrained_cfg) + elif isinstance(pretrained_cfg, str): + pretrained_tag = pretrained_cfg + pretrained_cfg = None + + # fallback to looking up pretrained cfg in model registry by variant identifier + if not pretrained_cfg: + if pretrained_tag: + model_with_tag = '.'.join([variant, pretrained_tag]) + pretrained_cfg = get_pretrained_cfg(model_with_tag) + + if not pretrained_cfg: + _logger.warning( + f"No pretrained configuration specified for {model_with_tag} model. Using a default." 
+ f" Please add a config to the model pretrained_cfg registry or pass explicitly.") + pretrained_cfg = PretrainedCfg() # instance with defaults + + pretrained_cfg_overlay = pretrained_cfg_overlay or {} + if not pretrained_cfg.architecture: + pretrained_cfg_overlay.setdefault('architecture', variant) + pretrained_cfg = dataclasses.replace(pretrained_cfg, **pretrained_cfg_overlay) + + return pretrained_cfg + + +def build_model_with_cfg( + model_cls: Callable, + variant: str, + pretrained: bool, + pretrained_cfg: Optional[Dict] = None, + pretrained_cfg_overlay: Optional[Dict] = None, + model_cfg: Optional[Any] = None, + feature_cfg: Optional[Dict] = None, + pretrained_strict: bool = True, + pretrained_filter_fn: Optional[Callable] = None, + kwargs_filter: Optional[Tuple[str]] = None, + **kwargs, +): + """ Build model with specified default_cfg and optional model_cfg + + This helper fn aids in the construction of a model including: + * handling default_cfg and associated pretrained weight loading + * passing through optional model_cfg for models with config based arch spec + * features_only model adaptation + * pruning config / model adaptation + + Args: + model_cls: model class + variant: model variant name + pretrained: load pretrained weights + pretrained_cfg: model's pretrained weight/task config + model_cfg: model's architecture config + feature_cfg: feature extraction adapter config + pretrained_strict: load pretrained weights strictly + pretrained_filter_fn: filter callable for pretrained weights + kwargs_filter: kwargs to filter before passing to model + **kwargs: model args passed through to model __init__ + """ + pruned = kwargs.pop('pruned', False) + features = False + feature_cfg = feature_cfg or {} + + # resolve and update model pretrained config and model kwargs + pretrained_cfg = resolve_pretrained_cfg( + variant, + pretrained_cfg=pretrained_cfg, + pretrained_cfg_overlay=pretrained_cfg_overlay + ) + + # FIXME converting back to dict, PretrainedCfg use should be propagated further, but not into model + pretrained_cfg = pretrained_cfg.to_dict() + + _update_default_model_kwargs(pretrained_cfg, kwargs, kwargs_filter) + + # Setup for feature extraction wrapper done at end of this fn + if kwargs.pop('features_only', False): + features = True + feature_cfg.setdefault('out_indices', (0, 1, 2, 3, 4)) + if 'out_indices' in kwargs: + feature_cfg['out_indices'] = kwargs.pop('out_indices') + if 'feature_cls' in kwargs: + feature_cfg['feature_cls'] = kwargs.pop('feature_cls') + + # Instantiate the model + if model_cfg is None: + model = model_cls(**kwargs) + else: + model = model_cls(cfg=model_cfg, **kwargs) + model.pretrained_cfg = pretrained_cfg + model.default_cfg = model.pretrained_cfg # alias for backwards compat + + if pruned: + model = adapt_model_from_file(model, variant) + + # For classification models, check class attr, then kwargs, then default to 1k, otherwise 0 for feats + num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000)) + if pretrained: + load_pretrained( + model, + pretrained_cfg=pretrained_cfg, + num_classes=num_classes_pretrained, + in_chans=kwargs.get('in_chans', 3), + filter_fn=pretrained_filter_fn, + strict=pretrained_strict, + ) + + # Wrap the model in a feature extraction module if enabled + if features: + use_getter = False + if 'feature_cls' in feature_cfg: + feature_cls = feature_cfg.pop('feature_cls') + if isinstance(feature_cls, str): + feature_cls = feature_cls.lower() + + # flatten_sequential only valid for 
some feature extractors + if feature_cls not in ('dict', 'list', 'hook'): + feature_cfg.pop('flatten_sequential', None) + + if 'hook' in feature_cls: + feature_cls = FeatureHookNet + elif feature_cls == 'list': + feature_cls = FeatureListNet + elif feature_cls == 'dict': + feature_cls = FeatureDictNet + elif feature_cls == 'fx': + feature_cls = FeatureGraphNet + elif feature_cls == 'getter': + use_getter = True + feature_cls = FeatureGetterNet + else: + assert False, f'Unknown feature class {feature_cls}' + else: + feature_cls = FeatureListNet + + output_fmt = getattr(model, 'output_fmt', None) + if output_fmt is not None and not use_getter: # don't set default for intermediate feat getter + feature_cfg.setdefault('output_fmt', output_fmt) + + model = feature_cls(model, **feature_cfg) + model.pretrained_cfg = pretrained_cfg_for_features(pretrained_cfg) # add back pretrained cfg + model.default_cfg = model.pretrained_cfg # alias for rename backwards compat (default_cfg -> pretrained_cfg) + + return model diff --git a/models/cv/classification/efficientnet_v2/ixrt/build_engine.py b/models/cv/classification/efficientnet_v2/ixrt/build_engine.py new file mode 100755 index 0000000000000000000000000000000000000000..41e6af8d2585da92d7f0354f7ced4ea4978bd652 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/build_engine.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
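+
+# build_engine.py: builds an IxRT engine from the exported ONNX model. For FP16 only the
+# builder flag is set; for INT8 an entropy calibrator backed by the ImageNet dataloader in
+# calibration_dataset.py is attached. Example invocation (paths are placeholders, adjust to
+# your environment):
+#   python3 build_engine.py --model checkpoints/efficientnet.onnx --precision float16 \
+#       --engine checkpoints/efficientnet_fp16.engine --datasets_dir ${DATASETS_DIR}
+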
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from calibration_dataset import getdataloader
+import cuda.cudart as cudart

+def assertSuccess(err):
+    assert(err == cudart.cudaError_t.cudaSuccess)
+
+class EngineCalibrator(tensorrt.IInt8EntropyCalibrator2):
+
+    def __init__(self, cache_file, datasets_dir, loop_count=10, bsz=1, img_sz=224):
+        super().__init__()
+        self.cache_file = cache_file
+        self.image_batcher = getdataloader(datasets_dir, loop_count, batch_size=bsz, img_sz=img_sz)
+        self.batch_generator = iter(self.image_batcher)
+        # device buffer large enough for one float32 calibration batch (NCHW)
+        size = img_sz * img_sz * 3 * bsz * np.dtype(np.float32).itemsize
+        err, self.batch_allocation = cudart.cudaMalloc(size)
+        assertSuccess(err)
+
+    def __del__(self):
+        err, = cudart.cudaFree(self.batch_allocation)
+        assertSuccess(err)
+
+    def get_batch_size(self):
+        return self.image_batcher.batch_size
+
+    def get_batch(self, names):
+        try:
+            batch, _ = next(self.batch_generator)
+            batch = batch.numpy()
+            err, = cudart.cudaMemcpy(self.batch_allocation,
+                                     np.ascontiguousarray(batch),
+                                     batch.nbytes,
+                                     cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+            assertSuccess(err)
+            return [int(self.batch_allocation)]
+        except StopIteration:
+            return None
+
+    def read_calibration_cache(self):
+        if os.path.exists(self.cache_file):
+            with open(self.cache_file, "rb") as f:
+                return f.read()
+
+    def write_calibration_cache(self, cache):
+        with open(self.cache_file, "wb") as f:
+            f.write(cache)
+
+def main(config):
+    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.VERBOSE)
+    builder = tensorrt.Builder(IXRT_LOGGER)
+    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    network = builder.create_network(EXPLICIT_BATCH)
+    build_config = builder.create_builder_config()
+    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+    parser.parse_from_file(config.model)
+
+    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+    print("precision : ", precision)
+    build_config.set_flag(precision)
+    if config.precision == "int8":
+        build_config.int8_calibrator = EngineCalibrator("int8_cache", config.datasets_dir)
+
+    plan = builder.build_serialized_network(network, build_config)
+    engine_file_path = config.engine
+    with open(engine_file_path, "wb") as f:
+        f.write(plan)
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+                        help="The precision of datatype")
+    parser.add_argument("--engine", type=str, default=None)
+    parser.add_argument(
+        "--datasets_dir",
+        type=str,
+        default="",
+        help="ImageNet dir",
+    )
+    args = parser.parse_args()
+    return args
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
\ No newline at end of file
diff --git a/models/cv/classification/efficientnet_v2/ixrt/build_i8_engine.py b/models/cv/classification/efficientnet_v2/ixrt/build_i8_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e356260376e8fc527251b5c842bbea535ffedcd
--- /dev/null
+++ b/models/cv/classification/efficientnet_v2/ixrt/build_i8_engine.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse +import json +import os + +import tensorrt +import tensorrt as trt + +TRT_LOGGER = trt.Logger(tensorrt.Logger.VERBOSE) + +EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + + +def GiB(val): + return val * 1 << 30 + + +def json_load(filename): + with open(filename) as json_file: + data = json.load(json_file) + return data + + +def setDynamicRange(network, json_file): + """Sets ranges for network layers.""" + quant_param_json = json_load(json_file) + act_quant = quant_param_json["act_quant_info"] + + for i in range(network.num_inputs): + input_tensor = network.get_input(i) + if act_quant.__contains__(input_tensor.name): + print(input_tensor.name) + value = act_quant[input_tensor.name] + tensor_max = abs(value) + tensor_min = -abs(value) + input_tensor.dynamic_range = (tensor_min, tensor_max) + + for i in range(network.num_layers): + layer = network.get_layer(i) + + for output_index in range(layer.num_outputs): + tensor = layer.get_output(output_index) + + if act_quant.__contains__(tensor.name): + value = act_quant[tensor.name] + tensor_max = abs(value) + tensor_min = -abs(value) + tensor.dynamic_range = (tensor_min, tensor_max) + else: + print("\033[1;32m%s\033[0m" % tensor.name) + + +def build_engine(onnx_file, json_file, engine_file): + builder = trt.Builder(TRT_LOGGER) + network = builder.create_network(EXPLICIT_BATCH) + + config = builder.create_builder_config() + + # If it is a dynamic onnx model , you need to add the following. + # profile = builder.create_optimization_profile() + # profile.set_shape("input_name", (batch, channels, min_h, min_w), (batch, channels, opt_h, opt_w), (batch, channels, max_h, max_w)) + # config.add_optimization_profile(profile) + + parser = trt.OnnxParser(network, TRT_LOGGER) + # config.max_workspace_size = GiB(1) + if not os.path.exists(onnx_file): + quit("ONNX file {} not found".format(onnx_file)) + + with open(onnx_file, "rb") as model: + if not parser.parse(model.read()): + print("ERROR: Failed to parse the ONNX file.") + for error in range(parser.num_errors): + print(parser.get_error(error)) + return None + + config.set_flag(trt.BuilderFlag.INT8) + + setDynamicRange(network, json_file) + + engine = builder.build_engine(network, config) + + with open(engine_file, "wb") as f: + f.write(engine.serialize()) + + +if __name__ == "__main__": + # Add plugins if needed + # import ctypes + # ctypes.CDLL("libmmdeploy_tensorrt_ops.so") + parser = argparse.ArgumentParser( + description="Writing qparams to onnx to convert tensorrt engine." 
+ ) + parser.add_argument("--onnx", type=str, default=None) + parser.add_argument("--qparam_json", type=str, default=None) + parser.add_argument("--engine", type=str, default=None) + arg = parser.parse_args() + + build_engine(arg.onnx, arg.qparam_json, arg.engine) + print("\033[1;32mgenerate %s\033[0m" % arg.engine) \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/ixrt/calibration_dataset.py b/models/cv/classification/efficientnet_v2/ixrt/calibration_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d7525d5136168cc8fb1d24a28f1b71b85ce4cc92 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/calibration_dataset.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import os + +import torch +import torchvision.datasets +from torch.utils.data import DataLoader +from torchvision import models +from torchvision import transforms as T + + +class CalibrationImageNet(torchvision.datasets.ImageFolder): + def __init__(self, *args, **kwargs): + super(CalibrationImageNet, self).__init__(*args, **kwargs) + img2label_path = os.path.join(self.root, "val_map.txt") + if not os.path.exists(img2label_path): + raise FileNotFoundError(f"Not found label file `{img2label_path}`.") + + self.img2label_map = self.make_img2label_map(img2label_path) + + def make_img2label_map(self, path): + with open(path) as f: + lines = f.readlines() + + img2lable_map = dict() + for line in lines: + line = line.lstrip().rstrip().split("\t") + if len(line) != 2: + continue + img_name, label = line + img_name = img_name.strip() + if img_name in [None, ""]: + continue + label = int(label.strip()) + img2lable_map[img_name] = label + return img2lable_map + + def __getitem__(self, index): + path, target = self.samples[index] + sample = self.loader(path) + if self.transform is not None: + sample = self.transform(sample) + # if self.target_transform is not None: + # target = self.target_transform(target) + img_name = os.path.basename(path) + target = self.img2label_map[img_name] + + return sample, target + + +def create_dataloaders(data_path, num_samples=1024, img_sz=224, batch_size=2, workers=0): + dataset = CalibrationImageNet( + data_path, + transform=T.Compose( + [ + T.Resize(256), + T.CenterCrop(img_sz), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ), + ) + + calibration_dataset = dataset + if num_samples is not None: + calibration_dataset = torch.utils.data.Subset( + dataset, indices=range(num_samples) + ) + + calibration_dataloader = DataLoader( + calibration_dataset, + shuffle=False, + batch_size=batch_size, + drop_last=False, + num_workers=workers, + ) + + verify_dataloader = DataLoader( + dataset, + shuffle=False, + batch_size=batch_size, + drop_last=False, + num_workers=workers, + ) + + return calibration_dataloader, verify_dataloader + + +def getdataloader(dataset_dir, step=20, batch_size=32, 
workers=2, img_sz=224, total_sample=50000): + num_samples = min(total_sample, step * batch_size) + if step < 0: + num_samples = None + calibration_dataloader, _ = create_dataloaders( + dataset_dir, + img_sz=img_sz, + batch_size=batch_size, + workers=workers, + num_samples=num_samples, + ) + return calibration_dataloader \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/ixrt/common.py b/models/cv/classification/efficientnet_v2/ixrt/common.py new file mode 100644 index 0000000000000000000000000000000000000000..abdc147cb913da3736ab2bc72628dc9cebf78d36 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/common.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import os +import cv2 +import glob +import torch +import tensorrt +import numpy as np +import pycuda.driver as cuda + +def eval_batch(batch_score, batch_label): + batch_score = torch.tensor(torch.from_numpy(batch_score), dtype=torch.float32) + values, indices = batch_score.topk(5) + top1, top5 = 0, 0 + for idx, label in enumerate(batch_label): + + if label == indices[idx][0]: + top1 += 1 + if label in indices[idx]: + top5 += 1 + return top1, top5 + +def create_engine_context(engine_path, logger): + with open(engine_path, "rb") as f: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + +def get_io_bindings(engine): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = engine.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + allocation = cuda.mem_alloc(size) + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + } + print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations diff --git a/models/cv/classification/efficientnet_v2/ixrt/config/EFFICIENTNET_V2T_CONFIG b/models/cv/classification/efficientnet_v2/ixrt/config/EFFICIENTNET_V2T_CONFIG new file mode 100644 index 0000000000000000000000000000000000000000..b9e40159818f0dec5fbffff0487b049dea9435ae --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/config/EFFICIENTNET_V2T_CONFIG @@ -0,0 +1,34 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# IMGSIZE : model input height/width
+# MODEL_NAME : basename for the generated onnx/engine files
+# ORIGINE_MODEL : file name of the original onnx model
+IMGSIZE=288
+MODEL_NAME=EfficientNetv2_t
+ORIGINE_MODEL=efficientnet.onnx
+
+# QUANT CONFIG (only takes effect when PRECISION is int8)
+    # QUANT_OBSERVER : quantization observer, one of [hist_percentile, percentile, minmax, entropy, ema]
+    # QUANT_BATCHSIZE : dataloader batch size used for quantization; keep it identical to the batch size in the onnx model, otherwise some ops (e.g. Reshape) may infer wrong shapes
+    # QUANT_STEP : number of quantization/calibration steps
+    # QUANT_SEED : random seed, keeps the quantization result reproducible
+    # QUANT_EXIST_ONNX : set this if a quantized onnx model from another source should be used
+QUANT_OBSERVER=hist_percentile
+QUANT_BATCHSIZE=32
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=
+QUANT_EXIST_ONNX=
diff --git a/models/cv/classification/efficientnet_v2/ixrt/export_onnx.py b/models/cv/classification/efficientnet_v2/ixrt/export_onnx.py
new file mode 100755
index 0000000000000000000000000000000000000000..4af35a0427b12d42740f13d52794c0c927fac339
--- /dev/null
+++ b/models/cv/classification/efficientnet_v2/ixrt/export_onnx.py
@@ -0,0 +1,950 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
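+
+# NOTE: this is an adapted copy of timm/models/efficientnet.py. It is copied into
+# pytorch-image-models/timm/models and run via `python3 -m models.export_onnx`
+# (see the README's Model Conversion step) to export the EfficientNetV2-T model to ONNX;
+# the default input resolution here is 288x288, matching IMGSIZE in the config file.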
+ +from functools import partial +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from ._efficientnet_blocks import SqueezeExcite +from ._efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights,\ + round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT +from .features import FeatureInfo, FeatureHooks +from .helpers import build_model_with_cfg, pretrained_cfg_for_features, checkpoint_seq +from .layers import create_conv2d, create_classifier, get_norm_act_layer, EvoNorm2dS0, GroupNormAct +from .registry import register_model +import argparse +import ssl + + + +ssl._create_default_https_context = ssl._create_unverified_context + +__all__ = ['EfficientNet', 'EfficientNetFeatures'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 288, 288), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv_stem', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'efficientnetv2_rw_t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_t_agc-3620981a.pth', + input_size=(3, 224, 224), test_input_size=(3, 288, 288), pool_size=(7, 7), crop_pct=1.0), + 'gc_efficientnetv2_rw_t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gc_efficientnetv2_rw_t_agc-927a0bde.pth', + input_size=(3, 224, 224), test_input_size=(3, 288, 288), pool_size=(7, 7), crop_pct=1.0), + 'efficientnetv2_rw_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_v2s_ra2_288-a6477665.pth', + input_size=(3, 288, 288), test_input_size=(3, 384, 384), pool_size=(9, 9), crop_pct=1.0), + 'efficientnetv2_rw_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_rw_m_agc-3d90cb1e.pth', + input_size=(3, 320, 320), test_input_size=(3, 416, 416), pool_size=(10, 10), crop_pct=1.0), + + 'efficientnetv2_s': _cfg( + url='', + input_size=(3, 288, 288), test_input_size=(3, 384, 384), pool_size=(9, 9), crop_pct=1.0), + 'efficientnetv2_m': _cfg( + url='', + input_size=(3, 320, 320), test_input_size=(3, 416, 416), pool_size=(10, 10), crop_pct=1.0), + 'efficientnetv2_l': _cfg( + url='', + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + 'efficientnetv2_xl': _cfg( + url='', + input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), +} + + +class EfficientNet(nn.Module): + """ EfficientNet + + A flexible and performant PyTorch implementation of efficient network architectures, including: + * EfficientNet-V2 Small, Medium, Large, XL & B0-B3 + * EfficientNet B0-B8, L2 + * EfficientNet-EdgeTPU + * EfficientNet-CondConv + * MixNet S, M, L, XL + * MnasNet A1, B1, and small + * MobileNet-V2 + * FBNet C + * Single-Path NAS Pixel1 + * TinyNet + """ + + def __init__( + self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32, fix_stem=False, + output_stride=32, pad_type='', round_chs_fn=round_channels, act_layer=None, norm_layer=None, + se_layer=None, drop_rate=0., drop_path_rate=0., global_pool='avg'): + super(EfficientNet, self).__init__() + act_layer = act_layer or nn.ReLU + 
norm_layer = norm_layer or nn.BatchNorm2d + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + se_layer = se_layer or SqueezeExcite + self.num_classes = num_classes + self.num_features = num_features + self.drop_rate = drop_rate + self.grad_checkpointing = False + + # Stem + if not fix_stem: + stem_size = round_chs_fn(stem_size) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_act_layer(stem_size, inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, + act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = builder.features + head_chs = builder.in_chs + + # Head + Pooling + self.conv_head = create_conv2d(head_chs, self.num_features, 1, padding=pad_type) + self.bn2 = norm_act_layer(self.num_features, inplace=True) + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + efficientnet_init_weights(self) + + def as_sequential(self): + layers = [self.conv_stem, self.bn1] + layers.extend(self.blocks) + layers.extend([self.conv_head, self.bn2, self.global_pool]) + layers.extend([nn.Dropout(self.drop_rate), self.classifier]) + return nn.Sequential(*layers) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^conv_stem|bn1', + blocks=[ + (r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)', None), + (r'conv_head|bn2', (99999,)) + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.bn1(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x, flatten=True) + else: + x = self.blocks(x) + x = self.conv_head(x) + x = self.bn2(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.classifier(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +class EfficientNetFeatures(nn.Module): + """ EfficientNet Feature Extractor + + A work-in-progress feature extraction module for EfficientNet, to use as a backbone for segmentation + and object detection models. 
+ """ + + def __init__( + self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', in_chans=3, + stem_size=32, fix_stem=False, output_stride=32, pad_type='', round_chs_fn=round_channels, + act_layer=None, norm_layer=None, se_layer=None, drop_rate=0., drop_path_rate=0.): + super(EfficientNetFeatures, self).__init__() + act_layer = act_layer or nn.ReLU + norm_layer = norm_layer or nn.BatchNorm2d + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + se_layer = se_layer or SqueezeExcite + self.drop_rate = drop_rate + + # Stem + if not fix_stem: + stem_size = round_chs_fn(stem_size) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_act_layer(stem_size, inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, + act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate, + feature_location=feature_location) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = FeatureInfo(builder.features, out_indices) + self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices} + + efficientnet_init_weights(self) + + # Register feature extraction hooks with FeatureHooks helper + self.feature_hooks = None + if feature_location != 'bottleneck': + hooks = self.feature_info.get_dicts(keys=('module', 'hook_type')) + self.feature_hooks = FeatureHooks(hooks, self.named_modules()) + + def forward(self, x) -> List[torch.Tensor]: + x = self.conv_stem(x) + x = self.bn1(x) + if self.feature_hooks is None: + features = [] + if 0 in self._stage_out_idx: + features.append(x) # add stem out + for i, b in enumerate(self.blocks): + x = b(x) + if i + 1 in self._stage_out_idx: + features.append(x) + return features + else: + self.blocks(x) + out = self.feature_hooks.get_output(x.device) + return list(out.values()) + + +def _create_effnet(variant, pretrained=False, **kwargs): + features_only = False + model_cls = EfficientNet + kwargs_filter = None + if kwargs.pop('features_only', False): + features_only = True + kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'global_pool') + model_cls = EfficientNetFeatures + model = build_model_with_cfg( + model_cls, variant, pretrained, + pretrained_strict=not features_only, + kwargs_filter=kwargs_filter, + **kwargs) + if features_only: + model.default_cfg = pretrained_cfg_for_features(model.default_cfg) + return model + + +def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-a1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r2_k3_s2_e6_c24'], + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40_se0.25'], + # stage 3, 28x28 in + ['ir_r4_k3_s2_e6_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112_se0.25'], + # stage 5, 14x14in + ['ir_r3_k5_s2_e6_c160_se0.25'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-b1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r3_k3_s2_e3_c24'], + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40'], + # stage 3, 28x28 in + ['ir_r3_k5_s2_e6_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c96'], + # stage 5, 14x14in + ['ir_r4_k5_s2_e6_c192'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320_noskip'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-b1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + ['ds_r1_k3_s1_c8'], + ['ir_r1_k3_s2_e3_c16'], + ['ir_r2_k3_s2_e6_c16'], + ['ir_r4_k5_s2_e6_c32_se0.25'], + ['ir_r3_k3_s1_e6_c32_se0.25'], + ['ir_r3_k5_s2_e6_c88_se0.25'], + ['ir_r1_k3_s1_e6_c144'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=8, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mobilenet_v2( + variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs): + """ Generate MobileNet-V2 network + Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py + Paper: https://arxiv.org/abs/1801.04381 + """ + arch_def = [ + ['ds_r1_k3_s1_c16'], + ['ir_r2_k3_s2_e6_c24'], + ['ir_r3_k3_s2_e6_c32'], + ['ir_r4_k3_s2_e6_c64'], + ['ir_r3_k3_s1_e6_c96'], + ['ir_r3_k3_s2_e6_c160'], + ['ir_r1_k3_s1_e6_c320'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head), + num_features=1280 if fix_stem_head else max(1280, round_chs_fn(1280)), + stem_size=32, + fix_stem=fix_stem_head, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'relu6'), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """ FBNet-C + + Paper: https://arxiv.org/abs/1812.03443 + Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py + + NOTE: the impl above does not relate to the 'C' variant here, that was derived from paper, + it was used to confirm some building block details + """ + arch_def = [ + ['ir_r1_k3_s1_e1_c16'], + ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'], + ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'], + ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'], + ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'], + ['ir_r4_k5_s2_e6_c184'], + ['ir_r1_k3_s1_e6_c352'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=16, + num_features=1984, # paper suggests this, but is not 100% clear + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates the Single-Path NAS model from search targeted for Pixel1 phone. + + Paper: https://arxiv.org/abs/1904.02877 + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r3_k3_s2_e3_c24'], + # stage 2, 56x56 in + ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'], + # stage 3, 28x28 in + ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'], + # stage 4, 14x14in + ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'], + # stage 5, 14x14in + ['ir_r4_k5_s2_e6_c192'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320_noskip'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet( + variant, channel_multiplier=1.0, depth_multiplier=1.0, channel_divisor=8, + group_size=None, pretrained=False, **kwargs): + """Creates an EfficientNet model. + + Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py + Paper: https://arxiv.org/abs/1905.11946 + + EfficientNet params + name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + 'efficientnet-b8': (2.2, 3.6, 672, 0.5), + 'efficientnet-l2': (4.3, 5.3, 800, 0.5), + + Args: + channel_multiplier: multiplier to number of channels per layer + depth_multiplier: multiplier to number of repeats per stage + + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], + ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], + ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25'], + ['ir_r4_k5_s2_e6_c192_se0.25'], + ['ir_r1_k3_s1_e6_c320_se0.25'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier, divisor=channel_divisor) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + act_layer=resolve_act_layer(kwargs, 'swish'), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet_edge( + variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, pretrained=False, **kwargs): + """ Creates an EfficientNet-EdgeTPU model + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu + """ + + arch_def = [ + # NOTE `fc` is present to override a mismatch between stem channels and in chs not + # present in other models + ['er_r1_k3_s1_e4_c24_fc24_noskip'], + ['er_r2_k3_s2_e8_c32'], + ['er_r4_k3_s2_e8_c48'], + ['ir_r5_k5_s2_e8_c96'], + ['ir_r4_k5_s1_e8_c144'], + ['ir_r2_k5_s2_e8_c192'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 
'relu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet_condconv( + variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs): + """Creates an EfficientNet-CondConv model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], + ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], + ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25_cc4'], + ['ir_r4_k5_s2_e6_c192_se0.25_cc4'], + ['ir_r1_k3_s1_e6_c320_se0.25_cc4'], + ] + # NOTE unlike official impl, this one uses `cc` option where x is the base number of experts for each stage and + # the expert_multiplier increases that on a per-model basis as with depth/channel multipliers + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'swish'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """Creates an EfficientNet-Lite model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite + Paper: https://arxiv.org/abs/1905.11946 + + EfficientNet params + name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) + 'efficientnet-lite0': (1.0, 1.0, 224, 0.2), + 'efficientnet-lite1': (1.0, 1.1, 240, 0.2), + 'efficientnet-lite2': (1.1, 1.2, 260, 0.3), + 'efficientnet-lite3': (1.2, 1.4, 280, 0.3), + 'efficientnet-lite4': (1.4, 1.8, 300, 0.3), + + Args: + channel_multiplier: multiplier to number of channels per layer + depth_multiplier: multiplier to number of repeats per stage + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16'], + ['ir_r2_k3_s2_e6_c24'], + ['ir_r2_k5_s2_e6_c40'], + ['ir_r3_k3_s2_e6_c80'], + ['ir_r3_k5_s1_e6_c112'], + ['ir_r4_k5_s2_e6_c192'], + ['ir_r1_k3_s1_e6_c320'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True), + num_features=1280, + stem_size=32, + fix_stem=True, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + act_layer=resolve_act_layer(kwargs, 'relu6'), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_base( + variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 base model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + arch_def = [ + ['cn_r1_k3_s1_e1_c16_skip'], + ['er_r2_k3_s2_e4_c32'], + ['er_r2_k3_s2_e4_c48'], + ['ir_r3_k3_s2_e4_c96_se0.25'], + ['ir_r5_k3_s1_e6_c112_se0.25'], + ['ir_r8_k3_s2_e6_c192_se0.25'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier, round_limit=0.) 
+ model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_s( + variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, rw=False, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Small model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + + NOTE: `rw` flag sets up 'small' variant to behave like my initial v2 small model, + before ref the impl was released. + """ + arch_def = [ + ['cn_r2_k3_s1_e1_c24_skip'], + ['er_r4_k3_s2_e4_c48'], + ['er_r4_k3_s2_e4_c64'], + ['ir_r6_k3_s2_e4_c128_se0.25'], + ['ir_r9_k3_s1_e6_c160_se0.25'], + ['ir_r15_k3_s2_e6_c256_se0.25'], + ] + num_features = 1280 + if rw: + # my original variant, based on paper figure differs from the official release + arch_def[0] = ['er_r2_k3_s1_e1_c24'] + arch_def[-1] = ['ir_r15_k3_s2_e6_c272_se0.25'] + num_features = 1792 + + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size), + num_features=round_chs_fn(num_features), + stem_size=24, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Medium model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + + arch_def = [ + ['cn_r3_k3_s1_e1_c24_skip'], + ['er_r5_k3_s2_e4_c48'], + ['er_r5_k3_s2_e4_c80'], + ['ir_r7_k3_s2_e4_c160_se0.25'], + ['ir_r14_k3_s1_e6_c176_se0.25'], + ['ir_r18_k3_s2_e6_c304_se0.25'], + ['ir_r5_k3_s1_e6_c512_se0.25'], + ] + + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=1280, + stem_size=24, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_l(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Large model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + + arch_def = [ + ['cn_r4_k3_s1_e1_c32_skip'], + ['er_r7_k3_s2_e4_c64'], + ['er_r7_k3_s2_e4_c96'], + ['ir_r10_k3_s2_e4_c192_se0.25'], + ['ir_r19_k3_s1_e6_c224_se0.25'], + ['ir_r25_k3_s2_e6_c384_se0.25'], + ['ir_r7_k3_s1_e6_c640_se0.25'], + ] + + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=1280, + stem_size=32, + 
round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_xl(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Xtra-Large model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + + arch_def = [ + ['cn_r4_k3_s1_e1_c32_skip'], + ['er_r8_k3_s2_e4_c64'], + ['er_r8_k3_s2_e4_c96'], + ['ir_r16_k3_s2_e4_c192_se0.25'], + ['ir_r24_k3_s1_e6_c256_se0.25'], + ['ir_r32_k3_s2_e6_c512_se0.25'], + ['ir_r8_k3_s1_e6_c640_se0.25'], + ] + + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=1280, + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MixNet Small model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet + Paper: https://arxiv.org/abs/1907.09595 + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16'], # relu + # stage 1, 112x112 in + ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'], # relu + # stage 2, 56x56 in + ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish + # stage 3, 28x28 in + ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'], # swish + # stage 4, 14x14in + ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish + # stage 5, 14x14in + ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + num_features=1536, + stem_size=16, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MixNet Medium-Large model. 
+ + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet + Paper: https://arxiv.org/abs/1907.09595 + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c24'], # relu + # stage 1, 112x112 in + ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'], # relu + # stage 2, 56x56 in + ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish + # stage 3, 28x28 in + ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'], # swish + # stage 4, 14x14in + ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish + # stage 5, 14x14in + ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'), + num_features=1536, + stem_size=24, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_tinynet( + variant, model_width=1.0, depth_multiplier=1.0, pretrained=False, **kwargs +): + """Creates a TinyNet model. + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25'], ['ir_r4_k5_s2_e6_c192_se0.25'], + ['ir_r1_k3_s1_e6_c320_se0.25'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'), + num_features=max(1280, round_channels(1280, model_width, 8, None)), + stem_size=32, + fix_stem=True, + round_chs_fn=partial(round_channels, multiplier=model_width), + act_layer=resolve_act_layer(kwargs, 'swish'), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + + +@register_model +def efficientnetv2_rw_t(pretrained=False, **kwargs): + """ EfficientNet-V2 Tiny (Custom variant, tiny not in paper). """ + model = _gen_efficientnetv2_s( + 'efficientnetv2_rw_t', channel_multiplier=0.8, depth_multiplier=0.9, rw=False, pretrained=pretrained, **kwargs) + return model + + +@register_model +def gc_efficientnetv2_rw_t(pretrained=False, **kwargs): + """ EfficientNet-V2 Tiny w/ Global Context Attn (Custom variant, tiny not in paper). """ + model = _gen_efficientnetv2_s( + 'gc_efficientnetv2_rw_t', channel_multiplier=0.8, depth_multiplier=0.9, + rw=False, se_layer='gc', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_rw_s(pretrained=False, **kwargs): + """ EfficientNet-V2 Small (RW variant). + NOTE: This is my initial (pre official code release) w/ some differences. + See efficientnetv2_s and tf_efficientnetv2_s for versions that match the official w/ PyTorch vs TF padding + """ + model = _gen_efficientnetv2_s('efficientnetv2_rw_s', rw=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_rw_m(pretrained=False, **kwargs): + """ EfficientNet-V2 Medium (RW variant). 
+ """ + model = _gen_efficientnetv2_s( + 'efficientnetv2_rw_m', channel_multiplier=1.2, depth_multiplier=(1.2,) * 4 + (1.6,) * 2, rw=True, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_s(pretrained=False, **kwargs): + """ EfficientNet-V2 Small. """ + model = _gen_efficientnetv2_s('efficientnetv2_s', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_m(pretrained=False, **kwargs): + """ EfficientNet-V2 Medium. """ + model = _gen_efficientnetv2_m('efficientnetv2_m', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_l(pretrained=False, **kwargs): + """ EfficientNet-V2 Large. """ + model = _gen_efficientnetv2_l('efficientnetv2_l', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_xl(pretrained=False, **kwargs): + """ EfficientNet-V2 Xtra-Large. """ + model = _gen_efficientnetv2_xl('efficientnetv2_xl', pretrained=pretrained, **kwargs) + return model + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--output_model", type=str) + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + model = efficientnetv2_rw_t(num_classes=1000, pretrained='imagenet') + model.cuda() + model.eval() + input = torch.randn(32, 3, 288, 288, device='cuda') + export_onnx_file = args.output_model + + torch.onnx.export(model, + input, + export_onnx_file, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names = ['input'], + output_names = ['output'], ) + print(" ") + print('Model has been converted to ONNX') + print("exit") + exit() \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/ixrt/inference.py b/models/cv/classification/efficientnet_v2/ixrt/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..62ec18b30cd51167fb8d7f2babc01430511ead3f --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/inference.py @@ -0,0 +1,158 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import json +import os +import re +import time +from tqdm import tqdm + +import cv2 +import numpy as np +import pycuda.autoinit +import pycuda.driver as cuda +import torch +import tensorrt + +from calibration_dataset import getdataloader +from common import eval_batch, create_engine_context, get_io_bindings + +def main(config): + dataloader = getdataloader(config.datasets_dir, config.loop_count, config.bsz, img_sz=config.imgsz) + + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + + # Load Engine && I/O bindings + engine, context = create_engine_context(config.engine_file, logger) + inputs, outputs, allocations = get_io_bindings(engine) + + # Warm up + if config.warm_up > 0: + print("\nWarm Start.") + for i in range(config.warm_up): + context.execute_v2(allocations) + print("Warm Done.") + + # Inference + if config.test_mode == "FPS": + torch.cuda.synchronize() + start_time = time.time() + + for i in range(config.loop_count): + context.execute_v2(allocations) + + torch.cuda.synchronize() + end_time = time.time() + forward_time = end_time - start_time + + num_samples = 50000 + if config.loop_count * config.bsz < num_samples: + num_samples = config.loop_count * config.bsz + fps = num_samples / forward_time + + print("FPS : ", fps) + print(f"Performance Check : Test {fps} >= target {config.fps_target}") + if fps >= config.fps_target: + print("pass!") + exit() + else: + print("failed!") + exit(1) + + elif config.test_mode == "ACC": + + ## Prepare the output data + output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + print(f"output shape : {output.shape} output type : {output.dtype}") + + total_sample = 0 + acc_top1, acc_top5 = 0, 0 + + with tqdm(total= len(dataloader)) as _tqdm: + for idx, (batch_data, batch_label) in enumerate(dataloader): + batch_data = batch_data.numpy().astype(inputs[0]["dtype"]) + batch_data = np.ascontiguousarray(batch_data) + total_sample += batch_data.shape[0] + + cuda.memcpy_htod(inputs[0]["allocation"], batch_data) + context.execute_v2(allocations) + cuda.memcpy_dtoh(output, outputs[0]["allocation"]) + + # squeeze output shape [32,1000,1,1] to [32,1000] for mobilenet_v2 model + if len(output.shape) == 4: + output = output.squeeze(axis=(2,3)) + + batch_top1, batch_top5 = eval_batch(output, batch_label) + acc_top1 += batch_top1 + acc_top5 += batch_top5 + + _tqdm.set_postfix(acc_1='{:.4f}'.format(acc_top1/total_sample), + acc_5='{:.4f}'.format(acc_top5/total_sample)) + _tqdm.update(1) + + print(F"Acc@1 : {acc_top1/total_sample} = {acc_top1}/{total_sample}") + print(F"Acc@5 : {acc_top5/total_sample} = {acc_top5}/{total_sample}") + acc1 = acc_top1/total_sample + print(f"Accuracy Check : Test {acc1} >= target {config.acc_target}") + if acc1 >= config.acc_target: + print("pass!") + exit() + else: + print("failed!") + exit(1) + +def parse_config(): + parser = argparse.ArgumentParser() + parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP") + parser.add_argument( + "--engine_file", + type=str, + help="engine file path" + ) + parser.add_argument( + "--datasets_dir", + type=str, + default="", + help="ImageNet dir", + ) + parser.add_argument("--warm_up", type=int, default=-1, help="warm_up times") + parser.add_argument("--bsz", type=int, default=32, help="test batch size") + parser.add_argument( + "--imgsz", + "--img", + "--img-size", + type=int, + default=224, + help="inference size h,w", + ) + parser.add_argument("--use_async", 
action="store_true") + parser.add_argument( + "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4" + ) + parser.add_argument("--fps_target", type=float, default=-1.0) + parser.add_argument("--acc_target", type=float, default=-1.0) + parser.add_argument("--loop_count", type=int, default=-1) + + config = parser.parse_args() + return config + +if __name__ == "__main__": + config = parse_config() + main(config) diff --git a/models/cv/classification/efficientnet_v2/ixrt/modify_batchsize.py b/models/cv/classification/efficientnet_v2/ixrt/modify_batchsize.py new file mode 100644 index 0000000000000000000000000000000000000000..4ac42a3084920c449bb80494518c5fedc8c64316 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/modify_batchsize.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import onnx +import argparse + +def change_input_dim(model, bsz): + batch_size = bsz + + # The following code changes the first dimension of every input to be batch_size + # Modify as appropriate ... note that this requires all inputs to + # have the same batch_size + inputs = model.graph.input + for input in inputs: + # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim. + # Add checks as needed. + dim1 = input.type.tensor_type.shape.dim[0] + # update dim to be a symbolic value + if isinstance(batch_size, str): + # set dynamic batch size + dim1.dim_param = batch_size + elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int): + # set given batch size + dim1.dim_value = int(batch_size) + else: + # set batch size of 1 + dim1.dim_value = 1 + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", type=int) + parser.add_argument("--origin_model", type=str) + parser.add_argument("--output_model", type=str) + args = parser.parse_args() + return args + +args = parse_args() +model = onnx.load(args.origin_model) +change_input_dim(model, args.batch_size) +onnx.save(model, args.output_model) + + + + + diff --git a/models/cv/classification/efficientnet_v2/ixrt/quant.py b/models/cv/classification/efficientnet_v2/ixrt/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..6c06eba295e22f7d61f354bf5f8f9c2014552485 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/quant.py @@ -0,0 +1,167 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""This is a highly automated PPQ quantization entry script: package your model and data as described below.
+
+In the automated API, quantization parameters are passed through a QuantizationSetting object.
+
+This file will show you how to quantize your network with PPQ
+    You should prepare your model and calibration dataset as follows:
+
+        ~/working/model.onnx                          <--  your model
+        ~/working/data/*.npy or ~/working/data/*.bin  <--  your dataset
+
+if you are using caffe model:
+        ~/working/model.caffemodel  <--  your model
+        ~/working/model.prototxt    <--  your model
+
+### MAKE SURE YOUR INPUT LAYOUT IS [N, C, H, W] or [C, H, W] ###
+
+quantized model will be generated at: ~/working/quantized.onnx
+"""
+from ppq import *
+from ppq.api import *
+import os
+from calibration_dataset import getdataloader
+import argparse
+import random
+import numpy as np
+import torch
+
+def setseed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--dataset_dir", type=str, default="imagenet_val")
+    parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"],
+                        default="hist_percentile")
+    parser.add_argument("--disable_quant_names", nargs='*', type=str)
+    parser.add_argument("--save_dir", type=str, help="save path", default=None)
+    parser.add_argument("--bsz", type=int, default=32)
+    parser.add_argument("--step", type=int, default=20)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--imgsz", type=int, default=288)
+    args = parser.parse_args()
+    print("Quant config:", args)
+    print(args.disable_quant_names)
+    return args
+
+
+config = parse_args()
+
+# modify configuration below:
+WORKING_DIRECTORY = 'checkpoints'                  # choose your working directory
+TARGET_PLATFORM = TargetPlatform.TRT_INT8          # choose your target platform
+MODEL_TYPE = NetworkFramework.ONNX                 # or NetworkFramework.CAFFE
+INPUT_LAYOUT = 'chw'                               # input data layout, chw or hwc
+NETWORK_INPUTSHAPE = [32, 3, 288, 288]             # input shape of your network
+EXECUTING_DEVICE = 'cuda'                          # 'cuda' or 'cpu'.
+REQUIRE_ANALYSE = False
+TRAINING_YOUR_NETWORK = False                      # whether to finetune your network
+# -------------------------------------------------------------------
+# Load your model file. PPQ parses ONNX or Caffe model files into its own graph format.
+# If you are working in PyTorch, TensorFlow, etc., export the model to ONNX first
+# (torch.onnx.export is enough). If exporting a torch model fails, please contact the PPQ authors.
+# -------------------------------------------------------------------
+graph = None
+if MODEL_TYPE == NetworkFramework.ONNX:
+    graph = load_onnx_graph(onnx_import_file=config.model)
+if MODEL_TYPE == NetworkFramework.CAFFE:
+    graph = load_caffe_graph(
+        caffemodel_path=os.path.join(WORKING_DIRECTORY, 'model.caffemodel'),
+        prototxt_path=os.path.join(WORKING_DIRECTORY, 'model.prototxt'))
+assert graph is not None, 'Graph Loading Error, Check your input again.'
+
+# -------------------------------------------------------------------
+# The SETTING object controls PPQ's quantization logic; it mainly describes graph fusion,
+# the dispatching scheme, and fine-grained quantization policies.
+# If the quantization error of your network is too high, tune the attributes of the
+# SETTING object to apply targeted optimizations.
+# -------------------------------------------------------------------
+QS = QuantizationSettingFactory.default_setting()
+
+# -------------------------------------------------------------------
+# The block below shows how to use finetuning to improve quantization accuracy.
+# PPQ ships more than ten algorithms to help you recover accuracy,
+# and they are all enabled via QS.xxxx = True.
+# Enable them as needed; switching everything on at once is rarely a good idea.
+# -------------------------------------------------------------------
+if TRAINING_YOUR_NETWORK:
+    QS.lsq_optimization = True                              # retrain the network to reduce quantization error
+    QS.lsq_optimization_setting.steps = 500                 # number of retraining steps; 500 steps take roughly a few minutes
+    QS.lsq_optimization_setting.collecting_device = 'cuda'  # where cached data lives; 'cuda' keeps it on the GPU, use 'cpu' if GPU memory runs out
+
+
+
+dataloader = getdataloader(config.dataset_dir, config.step, config.bsz, img_sz=config.imgsz)
+# ENABLE_CUDA_KERNEL speeds quantization up by roughly 3x ~ 10x, but it cannot compile without
+# the proper build environment. Either install that environment, or quantize without the CUDA
+# kernel by removing the `with ENABLE_CUDA_KERNEL():` block.
+with ENABLE_CUDA_KERNEL():
+    print('Quantizing the network; depending on your configuration this will take a while:')
+    quantized = quantize_native_model(
+        setting=QS,                     # the setting object controls the standard quantization logic
+        model=graph,
+        calib_dataloader=dataloader,
+        calib_steps=config.step,
+        input_shape=NETWORK_INPUTSHAPE, # use this argument if your network has a single input
+        inputs=None,
+        # if your network has multiple inputs, use this argument instead, i.e.
+        # input_shape=None, inputs=[torch.zeros(1,3,224,224), torch.zeros(1,3,224,224)]
+        collate_fn=lambda x: x[0].to(EXECUTING_DEVICE),  # collate_fn works like the torch dataloader collate_fn and handles preprocessing;
+                                                         # you may also use the dataloader's own collate_fn and set this to None
+        platform=TARGET_PLATFORM,
+        device=EXECUTING_DEVICE,
+        do_quantize=True)
+
+    # -------------------------------------------------------------------
+    # To run the quantized network and inspect its outputs, create an executor.
+    # The executor behaves much like a torch.nn.Module and returns inference results.
+    # Note that this must happen before the export step.
+    # -------------------------------------------------------------------
+    executor = TorchExecutor(graph=quantized, device=EXECUTING_DEVICE)
+    # output = executor.forward(input)
+
+    # -------------------------------------------------------------------
+    # PPQ reports quantization error as the reciprocal of the signal-to-noise ratio,
+    # i.e. noise energy / signal energy. An error of 0.1 means quantization noise carries
+    # about 10% of the total signal energy. graphwise_error_analyse measures the accumulated
+    # error: the last layers usually show the largest values because every preceding layer
+    # contributes. Use layerwise_error_analyse to trace the error back layer by layer.
+    # -------------------------------------------------------------------
+    print('Computing graph-wise quantization error (SNR); the last layer should stay below 0.1 to preserve accuracy:')
+    reports = graphwise_error_analyse(
+        graph=quantized, running_device=EXECUTING_DEVICE, steps=32,
+        dataloader=dataloader, collate_fn=lambda x: x[0].to(EXECUTING_DEVICE))
+    for op, snr in reports.items():
+        if snr > 0.1: ppq_warning(f'Layer {op} shows significant accumulated quantization error; consider further optimization.')
+
+    if REQUIRE_ANALYSE:
+        print('Computing layer-wise quantization error (SNR); each layer should stay below 0.1 to preserve accuracy:')
+        layerwise_error_analyse(graph=quantized, running_device=EXECUTING_DEVICE,
+                                interested_outputs=None,
+                                dataloader=dataloader, collate_fn=lambda x: x.to(EXECUTING_DEVICE))
+
+    # -------------------------------------------------------------------
+    # Export the quantized model with export_ppq_graph; PPQ adapts the model format
+    # to the selected export platform.
+    # -------------------------------------------------------------------
+    print('Quantization finished; writing output files:')
+    export_ppq_graph(
+        graph=quantized, platform=TARGET_PLATFORM,
+        graph_save_to=os.path.join(config.save_dir, f"quantized_{config.model_name}.onnx"),
+        config_save_to=os.path.join(config.save_dir, 'quant_cfg.json'))
\ No newline at end of file
diff --git a/models/cv/classification/efficientnet_v2/ixrt/refine_model.py
b/models/cv/classification/efficientnet_v2/ixrt/refine_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..000ee4dcbf3df294a34cd83c97527bba00024ac7
--- /dev/null
+++ b/models/cv/classification/efficientnet_v2/ixrt/refine_model.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import argparse
+import dataclasses
+
+import torch
+import onnx
+
+from refine_utils.matmul_to_gemm_pass import FusedGemmPass
+from refine_utils.linear_pass import FusedLinearPass
+
+from refine_utils.common import *
+
+def get_constant_input_name_of_operator(graph: Graph, operator: Operator):
+    const = None
+    for input in operator.inputs:
+        if not graph.containe_var(input):
+            continue
+
+        if not graph.is_leaf_variable(input):
+            continue
+
+        input_var = graph.get_variable(input)
+        if input_var.value is not None:
+            const = input
+    return const
+
+class FuseLayerNormPass(BasePass):
+
+    def process(self, graph: Graph) -> Graph:
+        self.transform = GraphTransform(graph)
+        find_sequence_subgraph(
+            graph,
+            [OP.REDUCE_MEAN, OP.SUB, OP.POW, OP.REDUCE_MEAN, OP.ADD, OP.SQRT, OP.DIV, OP.MUL, OP.ADD],
+            self.fuse_layer_norm,
+            strict=False
+        )
+        return graph
+
+    def fuse_layer_norm(self, graph: Graph, pattern: PatternGraph):
+        # Check that the REDUCE_MEAN input matches the SUB input
+        if pattern.nodes[0].operator.inputs[0] != pattern.nodes[1].operator.inputs[0]:
+            return
+
+        # Check that the POW input matches the DIV input
+        if pattern.nodes[2].operator.inputs[0] != pattern.nodes[6].operator.inputs[0]:
+            return
+
+        # Make sure none of the intermediate outputs is consumed by more than one operator
+        nodes = pattern.nodes
+        for node in [nodes[0]] + nodes[2:-1]:
+            next_ops = graph.get_next_operators(node.operator)
+            if len(next_ops) > 1:
+                return
+
+        eps = None
+        for input in nodes[4].operator.inputs:
+            input_var = graph.get_variable(input)
+            if input_var.value is not None and graph.is_leaf_variable(input):
+                eps = to_py_type(input_var.value)
+
+        scale = get_constant_input_name_of_operator(graph, nodes[-2].operator)
+        bias = get_constant_input_name_of_operator(graph, nodes[-1].operator)
+
+        self.transform.delete_operators_between_op_op(nodes[0].operator, nodes[-1].operator)
+
+        bias_var = graph.get_variable(bias)
+        print(bias_var)
+
+        attributes = {
+            "axis": nodes[0].operator.attributes.axes,
+            "epsilon": eps,
+        }
+
+
+        layer_norm_op = self.transform.make_operator(
+            op_type="LayerNormalization",
+            inputs=[nodes[0].operator.inputs[0], scale, bias],
+            outputs=[nodes[-1].operator.outputs[0]],
+            **attributes
+        )
+
+        self.transform.add_operator(layer_norm_op)
+
+class FusedGeluPass(BasePass):
+
+    def process(self, graph: Graph) -> Graph:
+        self.transform = GraphTransform(graph)
+
+        find_sequence_subgraph(
+            graph, pattern=[OP.DIV, OP.ERF, OP.ADD, OP.MUL, OP.MUL], callback=self.fuse_gelu, strict=True
+        )
+        return graph
+
+    def fuse_gelu(self, graph: Graph, pattern: PatternGraph):
+        nodes = pattern.nodes
+        prev_op = self.transform.get_previous_operators(nodes[0].operator)[0]
+        next_ops =
self.transform.get_next_operators(prev_op) + if len(next_ops) != 2: + return + + if nodes[0].operator not in next_ops or nodes[3].operator not in next_ops: + return + + gelu_op_input = None + for input in nodes[3].operator.inputs: + if input in nodes[0].operator.inputs: + gelu_op_input = input + break + + self.transform.delete_operators_between_op_op(nodes[0].operator, nodes[-1].operator) + + gelu_op = self.transform.make_operator( + op_type=OP.GELU, + inputs=[gelu_op_input], + outputs=[nodes[-1].operator.outputs[0]] + ) + self.transform.add_operator(gelu_op) + +@dataclasses.dataclass +class NormalizeAttr(BaseOperatorAttr): + p: float = 2.0 + epsilon: float = 1e-12 + axis: int = 1 + + +@registe_operator(OP.GELU) +class GeluOperator(BaseOperator): + + def call( + self, + executor, + operator: Operator, + inputs: List, + attr: NormalizeAttr, + ): + return F.gelu(inputs[0]) + + def convert_onnx_operator( + self, ir_graph: Graph, onnx_graph: onnx.GraphProto, node: onnx.NodeProto + ) -> Operator: + return default_converter(ir_graph, onnx_graph, node, attr_cls=attr.EmptyAttr) + + def quantize( + self, + graph: Graph, + op: Operator, + operator_observer_config: QuantOperatorObserverConfig, + quant_outputs: bool = False, + ): + return quant_single_input_operator(graph, op, operator_observer_config, quant_outputs=quant_outputs) + + + +class ClearUnsedVariables(BasePass): + + def process(self, graph: Graph) -> Graph: + vars = list(graph.variables) + + for var in vars: + if len(graph.get_dst_operators(var)) == 0 and graph.is_leaf_variable(var): + graph.delete_variable(var) + + quant_params = list(graph.quant_parameters.keys()) + for var in quant_params: + if not graph.containe_var(var): + graph.quant_parameters.pop(var) + + return graph + +class FormatLayerNorm(BasePass): + + def process(self, graph: Graph) -> Graph: + for op in graph.operators.values(): + if "LayerNorm" in op.op_type: + self.format_layer_norm(graph, op) + return graph + + def format_layer_norm(self, graph, operator): + if not hasattr(operator.attributes, "axis"): + return + if isinstance(operator.attributes.axis, (tuple, list)): + operator.attributes.axis = operator.attributes.axis[0] + +class FormatReshape(BasePass): + + def process(self, graph: Graph) -> Graph: + for op in graph.operators.values(): + if op.op_type == "Reshape": + self.format_reshape(graph, op) + + return graph + + def format_reshape(self, graph, operator): + shape = graph.get_variable(operator.inputs[1]) + shape.value = torch.tensor(shape.value, dtype=torch.int64) + +class FormatScalar(BasePass): + + def process(self, graph: Graph): + for var in graph.variables.values(): + var: Variable + use_ops = graph.get_dst_operators(var) + + if len(use_ops) == 0: + continue + + if use_ops[0].op_type not in [OP.MUL, OP.ADD, OP.GATHER]: + continue + + if var.value is not None and var.value.ndim == 0: + var.value = var.value.reshape(1) + print(f"Reshape scalar to tensor for {var.name}.") + + return graph + +class RenamePass(BasePass): + + def process(self, graph:Graph): + + names = [name for name in graph.operators.keys()] + for old_name in names: + new_name = old_name.replace("/", "#") + + graph.rename_operator(old_name, new_name) + + names = [name for name in graph.variables.keys()] + for name in names: + new_name = name.replace("/", ".").replace("Output", "out").replace("output", "out") + + graph.rename_vaiable(name, new_name, + with_variables=True, + with_operator_outputs=True) + + return graph + +def create_pipeline(example_inputs): + return PassSequence( + # 
FuseLayerNormPass(), + FusedGeluPass(), + + # ClearUnsedVariables(), + # FormatLayerNorm(), + # FormatReshape(), + # FormatScalar(), + # RenamePass() + ) + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--onnx_path", type=str) + parser.add_argument("--dst_onnx_path", type=str) + + parser.add_argument("--bsz", type=int, default=8, + help="Batch size") + parser.add_argument("--imgsz", type=int, default=224, + help="Image size") + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + example_inputs = torch.randn(args.bsz, 3, args.imgsz, args.imgsz) + + refine_pipline = Pipeline( + create_source(f"{args.onnx_path}", example_inputs=example_inputs), + create_pipeline(example_inputs), + create_target( + f"{args.dst_onnx_path}", + example_inputs=example_inputs, + ) + ) + refine_pipline.run() + + print(f"refine the model, input shape={example_inputs.shape}") diff --git a/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_fp16_accuracy.sh b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_fp16_accuracy.sh new file mode 100755 index 0000000000000000000000000000000000000000..e62cc5d702156b26432b9eb892aecd9c0432be16 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_fp16_accuracy.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
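+
+# Usage sketch (the values below are illustrative, not defaults): this script relies on
+# CONFIG_DIR, CHECKPOINTS_DIR, DATASETS_DIR and RUN_DIR being exported beforehand, as
+# described in the README, and accepts two optional flags parsed further down, e.g.
+#   bash scripts/infer_efficientnet_fp16_accuracy.sh --bs 32 --tgt 0.80
+# where --bs overrides the batch size and --tgt sets the Top-1 accuracy target that
+# inference.py checks in ACC mode.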
+ +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=0 +LOOP_COUNT=-1 +RUN_MODE=ACC +PRECISION=float16 + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Model Input Name : ${MODEL_INPUT_NAME} +echo Model Output Name : ${MODEL_OUTPUT_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_fp16_performance.sh b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_fp16_performance.sh new file mode 100755 index 0000000000000000000000000000000000000000..05c9986fc0a45b12847ed9c333f014f170afcdcf --- /dev/null +++ 
b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_fp16_performance.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=3 +LOOP_COUNT=20 +RUN_MODE=FPS +PRECISION=float16 + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Model Input Name : ${MODEL_INPUT_NAME} +echo Model Output Name : ${MODEL_OUTPUT_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate 
Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --fps_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_int8_accuracy.sh b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_int8_accuracy.sh new file mode 100755 index 0000000000000000000000000000000000000000..a58f44b3b55f714b3ed117a13d513b79c42e8929 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_int8_accuracy.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +set -x +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=0 +LOOP_COUNT=-1 +RUN_MODE=ACC +PRECISION=int8 + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +echo ${QUANT_OBSERVER} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Model Input Name : ${MODEL_INPUT_NAME} +echo Model Output Name : ${MODEL_OUTPUT_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ + echo [STEP ${step}] : Simplify Model + if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed + else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} + fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + + # Change Batchsize + let 
step++ + echo; + echo [STEP ${step}] : Change Batchsize + FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx + if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed + else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} + fi + + # Build Engine + let step++ + echo; + echo [STEP ${step}] : Build Engine + ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine + if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed + else + python3 ${RUN_DIR}/build_i8_engine.py \ + --onnx ${FINAL_MODEL} \ + --qparam_json ${CHECKPOINTS_DIR}/quant_cfg.json \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} + fi + +# Inference +# let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_int8_performance.sh b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_int8_performance.sh new file mode 100755 index 0000000000000000000000000000000000000000..07872405b8324d1eed6ba108d699a76c089c0d84 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/ixrt/scripts/infer_efficientnet_int8_performance.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
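+
+# Flow sketch, summarizing the steps below: simplify the ONNX model, quantize it with
+# quant.py (which writes quantized_${MODEL_NAME}.onnx plus quant_cfg.json), rewrite the
+# batch size with modify_batchsize.py, build the INT8 engine with build_i8_engine.py,
+# and finally run inference.py in FPS mode.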
+ +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=3 +LOOP_COUNT=20 +RUN_MODE=FPS +PRECISION=int8 + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +echo ${QUANT_OBSERVER} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Model Input Name : ${MODEL_INPUT_NAME} +echo Model Output Name : ${MODEL_OUTPUT_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ + echo [STEP ${step}] : Simplify Model + if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed + else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} + fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + + # Change Batchsize + let step++ + echo; + echo [STEP ${step}] : Change Batchsize + FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx + if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed + else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} + fi + + # Build Engine + let step++ + echo; + echo [STEP ${step}] : Build Engine + ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine + if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed + else + python3 ${RUN_DIR}/build_i8_engine.py \ + --onnx ${FINAL_MODEL} \ + --qparam_json ${CHECKPOINTS_DIR}/quant_cfg.json \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} + fi + +# Inference +# let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/ixrt/simplify_model.py b/models/cv/classification/efficientnet_v2/ixrt/simplify_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4d53a474011539600c4cf2b92617fa4e51e18273 --- /dev/null +++ 
b/models/cv/classification/efficientnet_v2/ixrt/simplify_model.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx
+import argparse
+from onnxsim import simplify
+
+# Simplify
+def simplify_model(args):
+    onnx_model = onnx.load(args.origin_model)
+    model_simp, check = simplify(onnx_model)
+    model_simp = onnx.shape_inference.infer_shapes(model_simp)
+    onnx.save(model_simp, args.output_model)
+    print(" Simplify onnx Done.")
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--origin_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    parser.add_argument("--reshape", action="store_true")
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+simplify_model(args)
+
+
+
+
diff --git a/models/cv/classification/hrnet_w18/ixrt/README.md b/models/cv/classification/hrnet_w18/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b856db0d43ca6a235ee75ce1d836dad549712a9f
--- /dev/null
+++ b/models/cv/classification/hrnet_w18/ixrt/README.md
@@ -0,0 +1,62 @@
+# HRNet-W18
+
+## Description
+HRNet-W18 is an image classification model from the HRNet (High-Resolution Network) family, proposed by researchers from the University of Science and Technology of China and Microsoft Research Asia. By maintaining high-resolution representations throughout the network, HRNet serves as a strong backbone for a wide range of computer vision tasks.
+
+## Setup
+
+### Install
+```bash
+yum install mesa-libGL
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install tabulate
+pip3 install ppq
+pip3 install mmpretrain
+pip3 install mmcv-lite
+```
+
+### Download
+
+Dataset: to download the validation dataset.
+
+### Model Conversion
+```bash
+mkdir checkpoints
+python3 export_onnx.py --output_model checkpoints/hrnet-w18.onnx
+```
+
+## Inference
+```bash
+export DATASETS_DIR=/path/to/imagenet_val/
+export CHECKPOINTS_DIR=./checkpoints
+export RUN_DIR=./
+export CONFIG_DIR=config/HRNET_W18_CONFIG
+```
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_hrnet_w18_fp16_accuracy.sh
+# Performance
+bash scripts/infer_hrnet_w18_fp16_performance.sh
+```
+
+### INT8
+```bash
+# Accuracy
+bash scripts/infer_hrnet_w18_int8_accuracy.sh
+# Performance
+bash scripts/infer_hrnet_w18_int8_performance.sh
+```
+
+## Results
+
+Model     | BatchSize | Precision | FPS | Top-1(%) | Top-5(%)
+----------|-----------|-----------|-----|----------|--------
+HRNet-W18 |           | FP16      |     |          |
+HRNet-W18 |           | INT8      |     |          |
+
+
diff --git a/models/cv/classification/hrnet_w18/ixrt/build_engine.py b/models/cv/classification/hrnet_w18/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..01e126bc715aa77d38c3abdd1e02191a262689e7
--- /dev/null
+++ b/models/cv/classification/hrnet_w18/ixrt/build_engine.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from calibration_dataset import getdataloader
+import cuda.cudart as cudart
+
+def assertSuccess(err):
+    assert(err == cudart.cudaError_t.cudaSuccess)
+
+class EngineCalibrator(tensorrt.IInt8EntropyCalibrator2):
+
+    def __init__(self, cache_file, datasets_dir, loop_count=10, bsz=1, img_sz=224):
+        super().__init__()
+        self.cache_file = cache_file
+        self.image_batcher = getdataloader(datasets_dir, loop_count, batch_size=bsz, img_sz=img_sz)
+        self.batch_generator = iter(self.image_batcher)
+        # Allocate device memory for one float32 batch (4 bytes per element).
+        size = img_sz * img_sz * 3 * bsz * np.dtype(np.float32).itemsize
+        err, self.batch_allocation = cudart.cudaMalloc(size)
+        assertSuccess(err)
+
+    def __del__(self):
+        err, = cudart.cudaFree(self.batch_allocation)
+        assertSuccess(err)
+
+    def get_batch_size(self):
+        return self.image_batcher.batch_size
+
+    def get_batch(self, names):
+        try:
+            batch, _ = next(self.batch_generator)
+            batch = batch.numpy()
+            cudart.cudaMemcpy(self.batch_allocation,
+                              np.ascontiguousarray(batch),
+                              batch.nbytes,
+                              cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+            return [int(self.batch_allocation)]
+        except StopIteration:
+            return None
+
+    def read_calibration_cache(self):
+        if os.path.exists(self.cache_file):
+            with open(self.cache_file, "rb") as f:
+                return f.read()
+
+    def write_calibration_cache(self, cache):
+        with open(self.cache_file, "wb") as f:
+            f.write(cache)
+
+def main(config):
+    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.VERBOSE)
+    builder = tensorrt.Builder(IXRT_LOGGER)
+    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    network = builder.create_network(EXPLICIT_BATCH)
+    build_config = builder.create_builder_config()
+    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+    parser.parse_from_file(config.model)
+
+    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+    print("precision : ", precision)
+    build_config.set_flag(precision)
+    if config.precision == "int8":
+        build_config.int8_calibrator = EngineCalibrator("int8_cache", config.datasets_dir)
+
+    plan = builder.build_serialized_network(network, build_config)
+    engine_file_path = config.engine
+    with open(engine_file_path, "wb") as f:
+        f.write(plan)
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+                        help="The precision of datatype")
+    parser.add_argument("--engine", type=str, default=None)
+    parser.add_argument(
+        "--datasets_dir",
+        type=str,
+        default="",
+        help="ImageNet dir",
+    )
+    args = parser.parse_args()
+    return args
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git
a/models/cv/classification/hrnet_w18/ixrt/build_i8_engine.py b/models/cv/classification/hrnet_w18/ixrt/build_i8_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..04477118efb75d3a16da66062eefcea23e5bb421 --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/build_i8_engine.py @@ -0,0 +1,38 @@ +import os +import cv2 +import argparse +import numpy as np + +import torch +import tensorrt + +def main(config): + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + parser.parse_from_file(config.model) + + precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 + # print("precision : ", precision) + build_config.set_flag(precision) + + plan = builder.build_serialized_network(network, build_config) + engine_file_path = config.engine + with open(engine_file_path, "wb") as f: + f.write(plan) + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str) + parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", + help="The precision of datatype") + parser.add_argument("--engine", type=str, default=None) + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/models/cv/classification/hrnet_w18/ixrt/calibration_dataset.py b/models/cv/classification/hrnet_w18/ixrt/calibration_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..f36c84597335e7b393b655a7f7a1d10fdd3cba0c --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/calibration_dataset.py @@ -0,0 +1,111 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
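Both engine builders above serialize a plan straight to disk: `build_engine.py` calibrates on the fly with `EngineCalibrator`, while `build_i8_engine.py` expects an ONNX model that already carries quantization information. A quick way to sanity-check a generated plan is to deserialize it and list its bindings. The following is a minimal sketch using the same `tensorrt` Python calls that `common.py` in this repo relies on; the engine path follows the naming convention of the run scripts and should be adjusted to your own output.

```python
import tensorrt

ENGINE_PATH = "checkpoints/HRNet_W18_int8_bs32.engine"  # adjust to your generated engine

logger = tensorrt.Logger(tensorrt.Logger.WARNING)
with open(ENGINE_PATH, "rb") as f:
    runtime = tensorrt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(f.read())

assert engine is not None, "engine failed to deserialize"
# Print every binding so the input/output shapes baked into the plan can be checked.
for i in range(engine.num_bindings):
    kind = "input" if engine.binding_is_input(i) else "output"
    print(kind, engine.get_binding_name(i), tuple(engine.get_binding_shape(i)))
```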
+import os +import torch +import torchvision.datasets +from torch.utils.data import DataLoader +from torchvision import models +from torchvision import transforms as T + + +class CalibrationImageNet(torchvision.datasets.ImageFolder): + def __init__(self, *args, **kwargs): + super(CalibrationImageNet, self).__init__(*args, **kwargs) + img2label_path = os.path.join(self.root, "val_map.txt") + if not os.path.exists(img2label_path): + raise FileNotFoundError(f"Not found label file `{img2label_path}`.") + + self.img2label_map = self.make_img2label_map(img2label_path) + + def make_img2label_map(self, path): + with open(path) as f: + lines = f.readlines() + + img2lable_map = dict() + for line in lines: + line = line.lstrip().rstrip().split("\t") + if len(line) != 2: + continue + img_name, label = line + img_name = img_name.strip() + if img_name in [None, ""]: + continue + label = int(label.strip()) + img2lable_map[img_name] = label + return img2lable_map + + def __getitem__(self, index): + path, target = self.samples[index] + sample = self.loader(path) + if self.transform is not None: + sample = self.transform(sample) + # if self.target_transform is not None: + # target = self.target_transform(target) + img_name = os.path.basename(path) + target = self.img2label_map[img_name] + + return sample, target + + +def create_dataloaders(data_path, num_samples=1024, img_sz=224, batch_size=2, workers=0): + dataset = CalibrationImageNet( + data_path, + transform=T.Compose( + [ + T.Resize(256), + T.CenterCrop(img_sz), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ), + ) + + calibration_dataset = dataset + if num_samples is not None: + calibration_dataset = torch.utils.data.Subset( + dataset, indices=range(num_samples) + ) + + calibration_dataloader = DataLoader( + calibration_dataset, + shuffle=False, + batch_size=batch_size, + drop_last=False, + num_workers=workers, + ) + + verify_dataloader = DataLoader( + dataset, + shuffle=False, + batch_size=batch_size, + drop_last=False, + num_workers=workers, + ) + + return calibration_dataloader, verify_dataloader + + +def getdataloader(dataset_dir, step=20, batch_size=32, workers=2, img_sz=224, total_sample=50000): + num_samples = min(total_sample, step * batch_size) + if step < 0: + num_samples = None + calibration_dataloader, _ = create_dataloaders( + dataset_dir, + img_sz=img_sz, + batch_size=batch_size, + workers=workers, + num_samples=num_samples, + ) + return calibration_dataloader \ No newline at end of file diff --git a/models/cv/classification/hrnet_w18/ixrt/common.py b/models/cv/classification/hrnet_w18/ixrt/common.py new file mode 100644 index 0000000000000000000000000000000000000000..0458195e5b7980ce70585d7284ca8a875afa3fd6 --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/common.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
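`CalibrationImageNet` above expects a `val_map.txt` file in the dataset root with one tab-separated `image_name<TAB>label` pair per line, and `getdataloader` draws `step * batch_size` images from the front of that dataset for calibration. A minimal sketch of how the loader might be exercised to verify the mapping and preprocessing (the dataset path is a placeholder):

```python
from calibration_dataset import getdataloader

# step * batch_size images are taken from the front of the dataset for calibration.
loader = getdataloader("/path/to/imagenet_val", step=2, batch_size=8, img_sz=224)
for images, labels in loader:
    # Expect normalized CHW float tensors of shape (8, 3, 224, 224).
    print(images.shape, images.dtype, labels[:4])
```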
+import os +import cv2 +import glob +import torch +import tensorrt +import numpy as np +import pycuda.driver as cuda + +def eval_batch(batch_score, batch_label): + batch_score = torch.tensor(torch.from_numpy(batch_score), dtype=torch.float32) + values, indices = batch_score.topk(5) + top1, top5 = 0, 0 + for idx, label in enumerate(batch_label): + + if label == indices[idx][0]: + top1 += 1 + if label in indices[idx]: + top5 += 1 + return top1, top5 + +def create_engine_context(engine_path, logger): + with open(engine_path, "rb") as f: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + +def get_io_bindings(engine): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = engine.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + allocation = cuda.mem_alloc(size) + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + } + print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations diff --git a/models/cv/classification/hrnet_w18/ixrt/config/HRNET_W18_CONFIG b/models/cv/classification/hrnet_w18/ixrt/config/HRNET_W18_CONFIG new file mode 100644 index 0000000000000000000000000000000000000000..d419e535aab4e798ac4f5d08f9a0f5a6561d49fc --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/config/HRNET_W18_CONFIG @@ -0,0 +1,33 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
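`eval_batch` above takes a NumPy array of per-image class scores plus the matching ground-truth labels and returns the top-1 and top-5 hit counts for the batch. A small self-contained sketch, using random scores purely for illustration:

```python
import numpy as np
from common import eval_batch

batch_scores = np.random.rand(4, 1000).astype(np.float32)  # fake logits for 4 images
batch_labels = [7, 42, 7, 901]                              # fake ground-truth labels

top1, top5 = eval_batch(batch_scores, batch_labels)
print(f"top1: {top1}/{len(batch_labels)}, top5: {top5}/{len(batch_labels)}")
```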
+# IMGSIZE : 模型输入hw大小 +# MODEL_NAME : 生成onnx/engine的basename +# ORIGINE_MODEL : 原始onnx文件名称 +IMGSIZE=224 +MODEL_NAME=HRNet_W18 +ORIGINE_MODEL=hrnet-w18.onnx + +# QUANT CONFIG (仅PRECISION为int8时生效) + # QUANT_OBSERVER : 量化策略,可选 [hist_percentile, percentile, minmax, entropy, ema] + # QUANT_BATCHSIZE : 量化时组dataloader的batchsize, 最好和onnx中的batchsize保持一致,有些op可能推导shape错误(比如Reshape) + # QUANT_STEP : 量化步数 + # QUANT_SEED : 随机种子 保证量化结果可复现 + # QUANT_EXIST_ONNX : 如果有其他来源的量化模型则填写 +QUANT_OBSERVER=minmax +QUANT_BATCHSIZE=32 +QUANT_STEP=32 +QUANT_SEED=42 +DISABLE_QUANT_LIST= +QUANT_EXIST_ONNX= diff --git a/models/cv/classification/hrnet_w18/ixrt/export_onnx.py b/models/cv/classification/hrnet_w18/ixrt/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..ceb3e7e156b63a893a24f0ea9b20dd42eacb7c97 --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/export_onnx.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (leoxiaobin@gmail.com) +# Modified by Bowen Cheng (bcheng9@illinois.edu) +# ------------------------------------------------------------------------------ + +import torch +from mmpretrain import get_model +import argparse + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--output_model", type=str) + args = parser.parse_args() + return args + +args = parse_args() +model = get_model('hrnet-w18_3rdparty_8xb32_in1k', pretrained=True) +model.cuda() +model.eval() +input = torch.randn(32, 3, 224, 224, device='cuda') +export_onnx_file = args.output_model + +torch.onnx.export(model, + input, + export_onnx_file, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names = ['input'], + output_names = ['output'],) +print(" ") +print('Model has been converted to ONNX') +print("exit") +exit() \ No newline at end of file diff --git a/models/cv/classification/hrnet_w18/ixrt/inference.py b/models/cv/classification/hrnet_w18/ixrt/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..86f0cdf26617350b269a3f17d875869740f1ff02 --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/inference.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. +import argparse +import json +import os +import re +import time +from tqdm import tqdm + +import cv2 +import numpy as np +import pycuda.autoinit +import pycuda.driver as cuda +import torch +import tensorrt + +from calibration_dataset import getdataloader +from common import eval_batch, create_engine_context, get_io_bindings + +def main(config): + dataloader = getdataloader(config.datasets_dir, config.loop_count, config.bsz, img_sz=config.imgsz) + + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + + # Load Engine && I/O bindings + engine, context = create_engine_context(config.engine_file, logger) + inputs, outputs, allocations = get_io_bindings(engine) + + # Warm up + if config.warm_up > 0: + print("\nWarm Start.") + for i in range(config.warm_up): + context.execute_v2(allocations) + print("Warm Done.") + + # Inference + if config.test_mode == "FPS": + torch.cuda.synchronize() + start_time = time.time() + + for i in range(config.loop_count): + context.execute_v2(allocations) + + torch.cuda.synchronize() + end_time = time.time() + forward_time = end_time - start_time + + num_samples = 50000 + if config.loop_count * config.bsz < num_samples: + num_samples = config.loop_count * config.bsz + fps = num_samples / forward_time + + print("FPS : ", fps) + print(f"Performance Check : Test {fps} >= target {config.fps_target}") + if fps >= config.fps_target: + print("pass!") + exit() + else: + print("failed!") + exit(1) + + elif config.test_mode == "ACC": + + ## Prepare the output data + output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + print(f"output shape : {output.shape} output type : {output.dtype}") + + total_sample = 0 + acc_top1, acc_top5 = 0, 0 + + with tqdm(total= len(dataloader)) as _tqdm: + for idx, (batch_data, batch_label) in enumerate(dataloader): + batch_data = batch_data.numpy().astype(inputs[0]["dtype"]) + batch_data = np.ascontiguousarray(batch_data) + total_sample += batch_data.shape[0] + + cuda.memcpy_htod(inputs[0]["allocation"], batch_data) + context.execute_v2(allocations) + cuda.memcpy_dtoh(output, outputs[0]["allocation"]) + + # squeeze output shape [32,1000,1,1] to [32,1000] for mobilenet_v2 model + if len(output.shape) == 4: + output = output.squeeze(axis=(2,3)) + + batch_top1, batch_top5 = eval_batch(output, batch_label) + acc_top1 += batch_top1 + acc_top5 += batch_top5 + + _tqdm.set_postfix(acc_1='{:.4f}'.format(acc_top1/total_sample), + acc_5='{:.4f}'.format(acc_top5/total_sample)) + _tqdm.update(1) + + print(F"Acc@1 : {acc_top1/total_sample} = {acc_top1}/{total_sample}") + print(F"Acc@5 : {acc_top5/total_sample} = {acc_top5}/{total_sample}") + acc1 = acc_top1/total_sample + print(f"Accuracy Check : Test {acc1} >= target {config.acc_target}") + if acc1 >= config.acc_target: + print("pass!") + exit() + else: + print("failed!") + exit(1) + +def parse_config(): + parser = argparse.ArgumentParser() + parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP") + parser.add_argument( + "--engine_file", + type=str, + help="engine file path" + ) + parser.add_argument( + "--datasets_dir", + type=str, + default="", + help="ImageNet dir", + ) + parser.add_argument("--warm_up", type=int, default=-1, help="warm_up times") + parser.add_argument("--bsz", type=int, default=32, help="test batch size") + parser.add_argument( + "--imgsz", + "--img", + "--img-size", + type=int, + default=224, + help="inference size h,w", + ) 
+ parser.add_argument("--use_async", action="store_true") + parser.add_argument( + "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4" + ) + parser.add_argument("--fps_target", type=float, default=-1.0) + parser.add_argument("--acc_target", type=float, default=-1.0) + parser.add_argument("--loop_count", type=int, default=-1) + + config = parser.parse_args() + return config + +if __name__ == "__main__": + config = parse_config() + main(config) \ No newline at end of file diff --git a/models/cv/classification/hrnet_w18/ixrt/modify_batchsize.py b/models/cv/classification/hrnet_w18/ixrt/modify_batchsize.py new file mode 100644 index 0000000000000000000000000000000000000000..689b7a972dcbfec77c185592ede16bb4f04fa4fd --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/modify_batchsize.py @@ -0,0 +1,56 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +import onnx +import argparse + +def change_input_dim(model, bsz): + batch_size = bsz + + # The following code changes the first dimension of every input to be batch_size + # Modify as appropriate ... note that this requires all inputs to + # have the same batch_size + inputs = model.graph.input + for input in inputs: + # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim. + # Add checks as needed. + dim1 = input.type.tensor_type.shape.dim[0] + # update dim to be a symbolic value + if isinstance(batch_size, str): + # set dynamic batch size + dim1.dim_param = batch_size + elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int): + # set given batch size + dim1.dim_value = int(batch_size) + else: + # set batch size of 1 + dim1.dim_value = 1 + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", type=int) + parser.add_argument("--origin_model", type=str) + parser.add_argument("--output_model", type=str) + args = parser.parse_args() + return args + +args = parse_args() +model = onnx.load(args.origin_model) +change_input_dim(model, args.batch_size) +onnx.save(model, args.output_model) + + + + + diff --git a/models/cv/classification/hrnet_w18/ixrt/quant_qdq.py b/models/cv/classification/hrnet_w18/ixrt/quant_qdq.py new file mode 100644 index 0000000000000000000000000000000000000000..8006db24562eaa36c59d77711e3e987c5b7ad38f --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/quant_qdq.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import os +import cv2 +import random +import argparse +import numpy as np +from random import shuffle +from tensorrt.deploy import static_quantize + +import torch +import torchvision.datasets +from calibration_dataset import getdataloader + +def setseed(seed=42): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", type=str) + parser.add_argument("--model", type=str) + parser.add_argument("--dataset_dir", type=str, default="imagenet_val") + parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile") + parser.add_argument("--disable_quant_names", nargs='*', type=str) + parser.add_argument("--save_dir", type=str, help="save path", default=None) + parser.add_argument("--bsz", type=int, default=32) + parser.add_argument("--step", type=int, default=20) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--imgsz", type=int, default=224) + args = parser.parse_args() + print("Quant config:", args) + print(args.disable_quant_names) + return args + +args = parse_args() +setseed(args.seed) +calibration_dataloader = getdataloader(args.dataset_dir, args.step, args.bsz, img_sz=args.imgsz) +static_quantize(args.model, + calibration_dataloader=calibration_dataloader, + save_quant_onnx_path=os.path.join(args.save_dir, f"quantized_{args.model_name}.onnx"), + observer=args.observer, + data_preprocess=lambda x: x[0].to("cuda"), + quant_format="qdq", + disable_quant_names=args.disable_quant_names) \ No newline at end of file diff --git a/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_fp16_accuracy.sh b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..b743d7084ae058118c29daaf494769fc293ceb41 --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_fp16_accuracy.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
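`static_quantize` above writes a `quantized_<model_name>.onnx` file in QDQ format, that is, with QuantizeLinear/DequantizeLinear pairs inserted around the quantized operators. One quick way to confirm that the exported model really contains Q/DQ nodes is to count them; a minimal sketch, assuming the default output location used by the run scripts:

```python
import onnx
from collections import Counter

# Path produced by quant_qdq.py when save_dir=checkpoints and model_name=HRNet_W18.
model = onnx.load("checkpoints/quantized_HRNet_W18.onnx")
ops = Counter(node.op_type for node in model.graph.node)
print("QuantizeLinear:", ops["QuantizeLinear"], "DequantizeLinear:", ops["DequantizeLinear"])
```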
+EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=0 +LOOP_COUNT=-1 +RUN_MODE=ACC +PRECISION=float16 + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} diff --git a/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_fp16_performance.sh b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..e7a4f1a7276406a0ed7400af4368b5bec2a06e06 --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_fp16_performance.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=3 +LOOP_COUNT=20 +RUN_MODE=FPS +PRECISION=float16 + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --fps_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} diff --git a/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_int8_accuracy.sh b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_int8_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e4a82bd58dccb94327021f7dd7d593f79b36d0c --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_int8_accuracy.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+set -x +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=0 +LOOP_COUNT=-1 +RUN_MODE=ACC +PRECISION=int8 +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +echo ${QUANT_OBSERVER} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Model Input Name : ${MODEL_INPUT_NAME} +echo Model Output Name : ${MODEL_OUTPUT_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ + echo [STEP ${step}] : Simplify Model + if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed + else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} + fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant_qdq.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + + # Change Batchsize + let step++ + echo; + echo [STEP ${step}] : Change Batchsize + FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx + if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed + else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} + fi + + # Build Engine + let step++ + echo; + echo [STEP ${step}] : Build Engine + ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine + if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed + else + python3 ${RUN_DIR}/build_i8_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + + +# Inference +# let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} diff --git a/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_int8_performance.sh b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_int8_performance.sh new file mode 100644 index 
0000000000000000000000000000000000000000..714a8284389203f9542bd6569fa1682f497fb756 --- /dev/null +++ b/models/cv/classification/hrnet_w18/ixrt/scripts/infer_hrnet_w18_int8_performance.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +# Run paraments +BSZ=32 +TGT=-1 +WARM_UP=3 +LOOP_COUNT=20 +RUN_MODE=FPS +PRECISION=int8 +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +echo ${QUANT_OBSERVER} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Model Input Name : ${MODEL_INPUT_NAME} +echo Model Output Name : ${MODEL_OUTPUT_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ + echo [STEP ${step}] : Simplify Model + if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed + else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} + fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + + # Change Batchsize + let step++ + echo; + echo [STEP ${step}] : Change Batchsize + FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx + if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed + else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} + fi + + # Build Engine + let step++ + echo; + echo [STEP ${step}] : Build Engine + ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine + if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, 
$ENGINE_FILE has been existed
+    else
+        python3 ${RUN_DIR}/build_i8_engine.py \
+            --precision ${PRECISION} \
+            --model ${FINAL_MODEL} \
+            --engine ${ENGINE_FILE}
+        echo " "Generate Engine ${ENGINE_FILE}
+    fi
+
+# Inference
+# let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+    --engine_file=${ENGINE_FILE} \
+    --datasets_dir=${DATASETS_DIR} \
+    --imgsz=${IMGSIZE} \
+    --warm_up=${WARM_UP} \
+    --loop_count ${LOOP_COUNT} \
+    --test_mode ${RUN_MODE} \
+    --fps_target ${TGT} \
+    --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/classification/hrnet_w18/ixrt/simplify_model.py b/models/cv/classification/hrnet_w18/ixrt/simplify_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9948a9fa083ff99ff88e556e96614b02cccaa965
--- /dev/null
+++ b/models/cv/classification/hrnet_w18/ixrt/simplify_model.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import onnx
+import argparse
+from onnxsim import simplify
+
+# Simplify
+def simplify_model(args):
+    onnx_model = onnx.load(args.origin_model)
+    model_simp, check = simplify(onnx_model)
+    model_simp = onnx.shape_inference.infer_shapes(model_simp)
+    onnx.save(model_simp, args.output_model)
+    print(" Simplify onnx Done.")
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--origin_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    parser.add_argument("--reshape", action="store_true")
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+simplify_model(args)
+
+
+
+
diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/README.md b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..35734aae9e3be8a0f0ccd505689775dbc14dc524
--- /dev/null
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md
@@ -0,0 +1,52 @@
+# Baichuan-2-7B
+
+## Description
+Baichuan 2 is a new-generation open-source large language model launched by Baichuan Intelligence. It is trained on 2.6 trillion tokens of high-quality data and achieves state-of-the-art results on authoritative Chinese, multilingual, and domain-specific benchmarks among models of a similar size, demonstrating strong language understanding and generation capabilities. This release includes the Base and Chat versions of the 7B model.
+
+## Setup
+
+### Install
+In order to run the model smoothly, we need the following dependency files:
+1. ixrt-xxx.whl
+2. ixformer-xxx.whl
+3. vllm-xxx.whl
+Please contact the staff to obtain the relevant installation packages.
+ +```bash +yum install mesa-libGL +pip3 install transformers==4.33.2 +pip3 install Path/To/ixrt-xxx.whl +pip3 install Path/To/vllm-xxx.whl +pip3 install Path/To/ixformer-xxx.whl +``` + +### Download +Pretrained model: + +```bash +mkdir /data/baichuan/ +mv Baichuan2-7B-Base.tar/zip /data/baichuan/ +``` + + +## Run model + +```bash +python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/ --chat_template template_baichuan.jinja --trust-remote-code +``` + +## Run Baichuan w8a16 quantization + +### Retrieve int8 weights + +Int8 weights will be saved at /data/baichuan/Baichuan2-7B-Base/int8 +```bash +python3 convert2int8.py --model-path /data/baichuan/Baichuan2-7B-Base/ +``` + +### Run + +```bash +python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --trust-remote-code --max-num-seqs 1 --max-model-len 256 \ + --trust-remote-code --tensor-parallel-size 2 --temperature 0.0 +``` \ No newline at end of file diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py b/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py new file mode 100644 index 0000000000000000000000000000000000000000..a244476159c5762f1b0fb5cceea86c50bd7c9066 --- /dev/null +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
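`offline_inference.py`, invoked in the README commands above, layers chat-template handling and a throughput measurement on top of the vLLM offline API. Stripped of those extras, the underlying call pattern reduces to roughly the following sketch; the model path and sampling values are illustrative only, and the real script builds them from the `EngineArgs` and `SamplingParams` command-line flags:

```python
from vllm import LLM, SamplingParams

# Illustrative settings only; adjust the model path to your local checkpoint.
llm = LLM(model="/data/baichuan/Baichuan2-7B-Base/", trust_remote_code=True)
sampling = SamplingParams(temperature=0.0, max_tokens=256)

outputs = llm.generate(["描述一下如何制作芝士披萨。"], sampling)
for out in outputs:
    print(out.outputs[0].text)
```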
+ +import torch +import os +import sys +from collections import OrderedDict +import argparse +import glob +import shutil +import json + +parser = argparse.ArgumentParser() +parser.add_argument("--model-path",type=str,default=None) +args = parser.parse_args() + + +def float2int8(load_path, save_path): + all_files = glob.glob(os.path.join(load_path,'*')) + os.makedirs(save_path) + print(f"save int8 weight to: {save_path}") + for raw_file in all_files: + ext_name = os.path.splitext(raw_file)[-1] + if ext_name in ['.json', '.py', '.model']: + dst_file = os.path.split(raw_file)[-1] + dst_file = os.path.join(save_path, dst_file) + shutil.copy(raw_file, dst_file) + print(f"copy file `{raw_file}` to `{dst_file}`") + elif ext_name == ".bin": + print(f"quantize `{raw_file}`") + params = torch.load(raw_file,map_location="cpu") + new_params = OrderedDict() + keys = ['proj','pack'] + for k,v in params.items(): + find_key = False + for key in keys: + if key in k: + scale = torch.abs(v).max(dim=-1)[0] / 127.0 + int8_v = torch.clamp(v / scale.view(-1,1),min=-127,max=127).to(torch.int8).contiguous() + scale = scale.view(1,-1).contiguous() + new_params[k] = int8_v + new_params[k.replace("weight","scales")] = scale + find_key = True + break + if find_key: + continue + # save the other param + new_params[k] = v + file_name = os.path.basename(raw_file) + file_name_no_suffix = file_name.rsplit('.',1)[0] + new_file_name = file_name_no_suffix+"_int8.bin" + torch.save(new_params,os.path.join(save_path,new_file_name)) + + config_file = os.path.join(save_path, "w8a16_config.json") + with open(config_file, 'w') as f: + f.write(json.dumps({})) + +if __name__ == "__main__": + model_path = args.model_path + save_path = os.path.join(model_path, "int8") + if os.path.isdir(save_path): + shutil.rmtree(save_path) + float2int8(model_path, save_path) diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9d01a7916f3e6e7f66e3dda6c963679e82b96085 --- /dev/null +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
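`convert2int8.py` above quantizes each `proj`/`pack` weight matrix per output row: the scale is the row's absolute maximum divided by 127, and the clamped ratio is cast to int8 and stored next to the scales. A small sketch of the same transform and its reconstruction error, using a made-up tensor, illustrates what the saved `*_int8.bin` files contain:

```python
import torch

w = torch.randn(8, 16)                       # stand-in for a proj/pack weight matrix
scale = torch.abs(w).max(dim=-1)[0] / 127.0  # one scale per output row, as in convert2int8.py
w_int8 = torch.clamp(w / scale.view(-1, 1), min=-127, max=127).to(torch.int8)

# Dequantize to see how closely the w8a16 weights track the original float weights.
w_dq = w_int8.to(torch.float32) * scale.view(-1, 1)
print("max abs error:", (w - w_dq).abs().max().item())
```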
+ +import sys +from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +from utils import build_chat,post_process,load_chat_template,sampling_add_cli_args + +import logging +import time +import argparse +import dataclasses +import inspect + +import torch +from vllm import LLM, SamplingParams, EngineArgs + + +parser = argparse.ArgumentParser() +parser.add_argument("--chat_template",type=str,default=None) +parser.add_argument("--remove_chat_template",default=True,action="store_false",help="pass this if you are not use a chat model") +parser = EngineArgs.add_cli_args(parser) +parser = sampling_add_cli_args(parser) +args = parser.parse_args() + +engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] +sampling_args = [param.name for param in list(inspect.signature(SamplingParams.__init__).parameters.values())[1:]] +engine_params = {attr:getattr(args, attr) for attr in engine_args} +sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} + +model_name = args.model.strip() +model_name = model_name if args.model[-1]!='/' else model_name[:-1] +model_name = model_name.rsplit('/')[-1] + + +# Sample prompts. +prompts = [ + "哪些迹象可能表明一个人正在经历焦虑?", + "描述一下如何制作芝士披萨。", + "写一篇有关5G网络研发的综述文章。" + ] + +# Create a sampling params object. +sampling_params = SamplingParams(**sampling_params) + +# Create an LLM. +llm = LLM(**engine_params) + +# process chat template +if not args.remove_chat_template: + if 'chat' not in model_name.lower(): + logging.warning(f"We assume that you are using the chat model, so additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure that the model path includes the chat character. " + f"for now, the model_name from model path is {model_name}") + prompts_new = prompts +else: + # Build chat model promopt + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(),args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [ + {"role": "user", "content": prompt} + ] + text = llm.get_tokenizer().apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.37.0)") + # Fall back to simple build chat, this part should be controled by model developer, we just provide a simple use cases + prompts_new = [build_chat(llm.get_tokenizer(),prompt,model_name,max_length=args.max_generate_tokens) for prompt in prompts] + +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. 
+outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) +torch.cuda.synchronize() + +start_time = time.perf_counter() +outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) +torch.cuda.synchronize() +end_time = time.perf_counter() +duration_time = end_time - start_time + +num_tokens = 0 +# Print the outputs. +for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = post_process(output.outputs[0].text,model_name) + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") +print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + +# 0.3.2 tokens: 757, QPS: 97.97229589080902 \ No newline at end of file diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a1812a6c09ab127ffd7fbe60fb9617de90f292c7 --- /dev/null +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja @@ -0,0 +1,22 @@ +{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} + +{% for message in messages %} +{% if message['role'] == 'user' %} + +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% elif message['role'] == 'assistant' %} + +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% endif %} +{% endfor %} +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} + +{% endif %} \ No newline at end of file diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1fc072d8d51e35109a97c17b5476e7bf3aa1448b --- /dev/null +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py @@ -0,0 +1,371 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from copy import deepcopy +from typing import Tuple, List, Union + +import codecs +import logging +import argparse + +# 对于chat模型,或者模型需要特定的输入,需要对prompt进行额外的处理。 +# 如果您在使用中有额外的prompt处理方式需求或者错误反馈,可以联系王坚或者巩亚飞,我们会对modelzoo进行更新适配。 + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. 
" + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. 
" + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}" + ) + else: + logging.warning( + "No chat template provided. Chat API will not work.") + +def default_build_chat(tokenizer,prompt): + return prompt + +def chatglm2_build_chat(tokenizer,prompt): + return tokenizer.build_prompt(prompt) + +def chatglm3_build_chat(tokenizer,prompt): + return tokenizer.build_chat_input(prompt).input_ids[0].tolist() + +def llama2_build_chat(tokenizer,prompt): + return f"[INST]{prompt}[/INST]" + +# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py +def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): + def _parse_messages(messages, split_role="user"): + system, rounds = "", [] + round = [] + for i, message in enumerate(messages): + if message["role"] == "system": + assert i == 0 + system = message["content"] + continue + if message["role"] == split_role and round: + rounds.append(round) + round = [] + round.append(message) + if round: + rounds.append(round) + return system, rounds + + messages = [{"role": "user", "content": f"{prompt}"}] + max_new_tokens = max_new_tokens + max_input_tokens = 4096 - max_new_tokens + system, rounds = _parse_messages(messages, split_role="user") + system_tokens = tokenizer.encode(system) + max_history_tokens = max_input_tokens - len(system_tokens) + + history_tokens = [] + for round in rounds[::-1]: + round_tokens = [] + for message in round: + if message["role"] == "user": + round_tokens.append(195) + else: + round_tokens.append(196) + round_tokens.extend(tokenizer.encode(message["content"])) + if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: + history_tokens = round_tokens + history_tokens # concat left + if len(history_tokens) < max_history_tokens: + continue + break + + input_tokens = system_tokens + history_tokens + if messages[-1]["role"] != "assistant": + input_tokens.append(196) + input_tokens = input_tokens[-max_input_tokens:] # truncate left + return input_tokens + +def qwen_build_chat( + tokenizer, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "", + max_window_size: int = 6144, + 
chat_format: str = "chatml", +): + if history is None: + history = [] + + if chat_format == "chatml": + im_start, im_end = "<|im_start|>", "<|im_end|>" + im_start_tokens = [tokenizer.im_start_id] + im_end_tokens = [tokenizer.im_end_id] + nl_tokens = tokenizer.encode("\n") + + def _tokenize_str(role, content): + return f"{role}\n{content}", tokenizer.encode( + role, allowed_special=set() + ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) + + system_text, system_tokens_part = _tokenize_str("system", system) + system_tokens = im_start_tokens + system_tokens_part + im_end_tokens + + raw_text = "" + context_tokens = [] + + for turn_query, turn_response in reversed(history): + query_text, query_tokens_part = _tokenize_str("user", turn_query) + query_tokens = im_start_tokens + query_tokens_part + im_end_tokens + response_text, response_tokens_part = _tokenize_str( + "assistant", turn_response + ) + response_tokens = im_start_tokens + response_tokens_part + im_end_tokens + + next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens + prev_chat = ( + f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" + ) + + current_context_size = ( + len(system_tokens) + len(next_context_tokens) + len(context_tokens) + ) + if current_context_size < max_window_size: + context_tokens = next_context_tokens + context_tokens + raw_text = prev_chat + raw_text + else: + break + + context_tokens = system_tokens + context_tokens + raw_text = f"{im_start}{system_text}{im_end}" + raw_text + context_tokens += ( + nl_tokens + + im_start_tokens + + _tokenize_str("user", query)[1] + + im_end_tokens + + nl_tokens + + im_start_tokens + + tokenizer.encode("assistant") + + nl_tokens + ) + raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" + + elif chat_format == "raw": + raw_text = query + context_tokens = tokenizer.encode(raw_text) + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + + return raw_text, context_tokens + +def codellama_build_chat(tokenizer,prompt): + return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:{}[/INST]".format(prompt) + +def build_chat(tokenizer, prompt, model_name, **kwargs): + model_name = model_name.lower() + # return str or list[int] + if "chatglm2" in model_name: + prompt = chatglm2_build_chat(tokenizer,prompt) + elif "chatglm3" in model_name: + prompt = chatglm3_build_chat(tokenizer,prompt) + elif "llama2" in model_name and 'chat' in model_name: + prompt = llama2_build_chat(tokenizer,prompt) + elif "baichuan2" in model_name and 'chat' in model_name: + prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) + elif "qwen" in model_name and 'chat' in model_name: + prompt = qwen_build_chat(tokenizer,prompt) + elif "code" in model_name and 'llama' in model_name: + prompt = codellama_build_chat(tokenizer,prompt) + else: + prompt = default_build_chat(tokenizer,prompt) + return prompt + + +# for output +def default_post_process(output): + return output + +def glm2_post_process(output): + output = output.strip() + output = output.replace("[[训练时间]]", "2023年") + return output + +def glm3_post_process(output, history=[]): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content + +def post_process(response, model_name,**kwargs): + model_name = model_name.lower() + if "chatglm2" in model_name: + response = glm2_post_process(response) + elif "chatglm3" in model_name: + response = glm3_post_process(response) + else: + response = default_post_process(response) + return response \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm/vllm/README.md b/models/nlp/large_language_model/chatglm/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..78c8d48c8b0866a9b835456828fb78217b4843fa --- /dev/null +++ b/models/nlp/large_language_model/chatglm/vllm/README.md @@ -0,0 +1,51 @@ +# ChatGLM3-6B + +## Description +ChatGLM3-6B is trained on large-scale natural language text data, enabling it to understand and generate text. It can be applied to various natural language processing tasks such as dialogue generation, text summarization, and language translation. + +## Setup + +### Install +In order to run the model smoothly, we need the following dependency files: +1. ixrt-xxx.whl +2. ixformer-xxx.whl +3. vllm-xxx.whl +Please contact the staff to obtain the relevant installation packages. 
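+The wheels are installed with the commands in the next block. Afterwards, an optional sanity check such as the one below confirms that the runtime packages import cleanly (only `vllm` and `transformers` are checked here; the ixrt/ixformer wheels are vendor builds, so their importable module names may differ).
+
+```bash
+# Optional: verify that vLLM and transformers import after installation
+python3 -c "import vllm, transformers; print(vllm.__version__, transformers.__version__)"
+```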
+ +```bash +yum install mesa-libGL +pip3 install transformers==4.33.2 +pip3 install Path/To/ixrt-xxx.whl +pip3 install Path/To/vllm-xxx.whl +pip3 install Path/To/ixformer-xxx.whl +``` + +### Download +Pretrained model: + +```bash +mkdir /data/chatglm/ +mv chatglm3-6b.zip/tar /data/chatglm/ +``` + + +## Run model + +```bash +python3 offline_inference.py --model /data/chatglm/chatglm3-6b --trust-remote-code --temperature 0.0 --max-tokens 256 +``` + +## Use the server + +### Start the server + +```bash +python3 -m vllm.entrypoints.openai.api_server --model /data/chatglm/chatglm3-6b --gpu-memory-utilization 0.9 --max-num-batched-tokens 8193 \ + --max-num-seqs 32 --disable-log-requests --host 127.0.0.1 --port 12345 --trust-remote-code +``` + +### Test using the OpenAI interface + +```bash +python3 server_inference.py --host 127.0.0.1 --port 12345 --model_path /data/chatglm/chatglm3-6b +``` \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..0162d93c53ac839268b3c964e0e96ecaad63ac4e --- /dev/null +++ b/models/nlp/large_language_model/chatglm/vllm/offline_inference.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +from utils import load_chat_template,sampling_add_cli_args + +import logging +import time +import argparse +import dataclasses +import inspect + +import torch +from vllm import LLM, SamplingParams, EngineArgs + + +parser = argparse.ArgumentParser() +parser.add_argument("--chat_template",type=str,default=None) +parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not use a chat model") +parser = EngineArgs.add_cli_args(parser) +parser = sampling_add_cli_args(parser) +args = parser.parse_args() + +engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] +sampling_args = [param.name for param in list(inspect.signature(SamplingParams.__init__).parameters.values())[1:]] +engine_params = {attr:getattr(args, attr) for attr in engine_args} +sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} + +model_name = args.model.strip() +model_name = model_name if args.model[-1]!='/' else model_name[:-1] +model_name = model_name.rsplit('/')[-1] + + +# Sample prompts. +prompts = [ + "哪些迹象可能表明一个人正在经历焦虑?", + "描述一下如何制作芝士披萨。", + "写一篇有关5G网络研发的综述文章。" + ] + +# Create a sampling params object. +sampling_params = SamplingParams(**sampling_params) + +# Create an LLM. 
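+# engine_params was collected above from the EngineArgs dataclass fields, so any
+# vLLM engine option given on the command line (e.g. --model, --trust-remote-code)
+# is forwarded to the LLM constructor unchanged.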
+llm = LLM(**engine_params) + +# process chat template +if args.remove_chat_template: + if 'chat' in model_name.lower(): + logging.warning(f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI.") + prompts_new = prompts +else: + # Build chat model promopt + logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") + logging.warning("For now, openai api chat interface(v1/chat/completions) need you provide a chat template to process prompt(str) for better results. " + "Otherwise, you have to use the default chat template, which may lead to bad answers. But, the process of building chat input is complex " + "for some models and the rule of process can not be written as a jinja file. Fortunately, the v1/completions interface support List[int] " + "params. This means you can process the prompt firstly, then send the List[int] to v1/completions and consider it as v1/chat/completions " + "to use when you use openai api.") + tokenizer = llm.get_tokenizer() + prompts_new = [] + for prompt in prompts: + input_idx = tokenizer.build_chat_input(prompt)['input_ids'][0].cpu().tolist() + prompts_new.append(input_idx) + +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) +torch.cuda.synchronize() + +start_time = time.perf_counter() +outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) +torch.cuda.synchronize() +end_time = time.perf_counter() +duration_time = end_time - start_time + +num_tokens = 0 +# Print the outputs. +for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") +print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + +# 0.3.2 tokens: 422, QPS: 70.02308283048338(tokens: 422, QPS: 93.67210003677407),32-k 模型 tokens: 477, QPS: 81.46537314533865(tokens: 477, QPS: 106.54247895449554) \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm/vllm/server_inference.py b/models/nlp/large_language_model/chatglm/vllm/server_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e60b6f9c25f71ce4d8be81e8d8eeb32712acc3f9 --- /dev/null +++ b/models/nlp/large_language_model/chatglm/vllm/server_inference.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse +import time +from openai import OpenAI +from transformers import AutoTokenizer + + +def send_request( + api_url: str, + prompt: str, + output_len: int, + stream: bool, +) -> None: + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key="EMPTY", + base_url=api_url, + ) + + models = client.models.list() + model = models.data[0].id + + completion = client.completions.create( + model=model, + # messages=[{"role": "user", "content": prompt},], + prompt=prompt, + n=1, + stream=stream, + max_tokens=output_len, + temperature=0.0 + ) + + if stream: + for each_com in completion: + print(each_com) + else: + print("++++++++++++++++++") + print(completion) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark the online serving throughput.") + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--stream", action="store_true") + parser.add_argument("--output_token", type=int, default=1024) + parser.add_argument("--model_path", type=str) + + args = parser.parse_args() + api_url = f"http://{args.host}:{args.port}/v1" + + prompts = [ + "你好", + "Which city is the capital of China?", + "1 + 1 = ?", + "中国的首都是哪里", + "请讲以下内容翻译为英文:\n你好,我来自中国。", + ] + + tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) + prompts = [tokenizer.build_chat_input(i).input_ids.tolist() for i in prompts] + + for prompt in prompts: + send_request(api_url,prompt,args.output_token,args.stream) diff --git a/models/nlp/large_language_model/chatglm/vllm/utils.py b/models/nlp/large_language_model/chatglm/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1fc072d8d51e35109a97c17b5476e7bf3aa1448b --- /dev/null +++ b/models/nlp/large_language_model/chatglm/vllm/utils.py @@ -0,0 +1,371 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from copy import deepcopy +from typing import Tuple, List, Union + +import codecs +import logging +import argparse + +# 对于chat模型,或者模型需要特定的输入,需要对prompt进行额外的处理。 +# 如果您在使用中有额外的prompt处理方式需求或者错误反馈,可以联系王坚或者巩亚飞,我们会对modelzoo进行更新适配。 + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. 
By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. 
The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}" + ) + else: + logging.warning( + "No chat template provided. Chat API will not work.") + +def default_build_chat(tokenizer,prompt): + return prompt + +def chatglm2_build_chat(tokenizer,prompt): + return tokenizer.build_prompt(prompt) + +def chatglm3_build_chat(tokenizer,prompt): + return tokenizer.build_chat_input(prompt).input_ids[0].tolist() + +def llama2_build_chat(tokenizer,prompt): + return f"[INST]{prompt}[/INST]" + +# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py +def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): + def _parse_messages(messages, split_role="user"): + system, rounds = "", [] + round = [] + for i, message in enumerate(messages): + if message["role"] == "system": + assert i == 0 + system = message["content"] + continue + if message["role"] == split_role and round: + rounds.append(round) + round = [] + round.append(message) + if round: + rounds.append(round) + return system, rounds + + messages = [{"role": "user", "content": f"{prompt}"}] + max_new_tokens = max_new_tokens + max_input_tokens = 4096 - max_new_tokens + system, rounds = _parse_messages(messages, split_role="user") + system_tokens = tokenizer.encode(system) + max_history_tokens = max_input_tokens - len(system_tokens) + + history_tokens = [] + for round in rounds[::-1]: + round_tokens = [] + for message in round: + if message["role"] == "user": + round_tokens.append(195) + else: + round_tokens.append(196) + round_tokens.extend(tokenizer.encode(message["content"])) + if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: + history_tokens = round_tokens + history_tokens # concat left + if len(history_tokens) < max_history_tokens: + continue + break + + input_tokens = system_tokens + history_tokens + if messages[-1]["role"] != "assistant": + input_tokens.append(196) + input_tokens = input_tokens[-max_input_tokens:] # truncate left + return input_tokens + +def qwen_build_chat( + tokenizer, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "", + max_window_size: int = 6144, + chat_format: str = "chatml", +): + if history is None: + history = [] + + if chat_format == "chatml": + im_start, im_end = "<|im_start|>", "<|im_end|>" + im_start_tokens = 
[tokenizer.im_start_id] + im_end_tokens = [tokenizer.im_end_id] + nl_tokens = tokenizer.encode("\n") + + def _tokenize_str(role, content): + return f"{role}\n{content}", tokenizer.encode( + role, allowed_special=set() + ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) + + system_text, system_tokens_part = _tokenize_str("system", system) + system_tokens = im_start_tokens + system_tokens_part + im_end_tokens + + raw_text = "" + context_tokens = [] + + for turn_query, turn_response in reversed(history): + query_text, query_tokens_part = _tokenize_str("user", turn_query) + query_tokens = im_start_tokens + query_tokens_part + im_end_tokens + response_text, response_tokens_part = _tokenize_str( + "assistant", turn_response + ) + response_tokens = im_start_tokens + response_tokens_part + im_end_tokens + + next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens + prev_chat = ( + f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" + ) + + current_context_size = ( + len(system_tokens) + len(next_context_tokens) + len(context_tokens) + ) + if current_context_size < max_window_size: + context_tokens = next_context_tokens + context_tokens + raw_text = prev_chat + raw_text + else: + break + + context_tokens = system_tokens + context_tokens + raw_text = f"{im_start}{system_text}{im_end}" + raw_text + context_tokens += ( + nl_tokens + + im_start_tokens + + _tokenize_str("user", query)[1] + + im_end_tokens + + nl_tokens + + im_start_tokens + + tokenizer.encode("assistant") + + nl_tokens + ) + raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" + + elif chat_format == "raw": + raw_text = query + context_tokens = tokenizer.encode(raw_text) + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + + return raw_text, context_tokens + +def codellama_build_chat(tokenizer,prompt): + return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:{}[/INST]".format(prompt) + +def build_chat(tokenizer, prompt, model_name, **kwargs): + model_name = model_name.lower() + # return str or list[int] + if "chatglm2" in model_name: + prompt = chatglm2_build_chat(tokenizer,prompt) + elif "chatglm3" in model_name: + prompt = chatglm3_build_chat(tokenizer,prompt) + elif "llama2" in model_name and 'chat' in model_name: + prompt = llama2_build_chat(tokenizer,prompt) + elif "baichuan2" in model_name and 'chat' in model_name: + prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) + elif "qwen" in model_name and 'chat' in model_name: + prompt = qwen_build_chat(tokenizer,prompt) + elif "code" in model_name and 'llama' in model_name: + prompt = codellama_build_chat(tokenizer,prompt) + else: + prompt = default_build_chat(tokenizer,prompt) + return prompt + + +# for output +def default_post_process(output): + return output + +def glm2_post_process(output): + output = output.strip() + output = output.replace("[[训练时间]]", "2023年") + return output + +def glm3_post_process(output, history=[]): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content + +def post_process(response, model_name,**kwargs): + model_name = model_name.lower() + if "chatglm2" in model_name: + response = glm2_post_process(response) + elif "chatglm3" in model_name: + response = glm3_post_process(response) + else: + response = default_post_process(response) + return response \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/README.md b/models/nlp/large_language_model/llama2-70b/trtllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..26e5c59a764a02a02bfc0f028f700564b881e714 --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/README.md @@ -0,0 +1,43 @@ +# LlaMa2 70B + +## Description +we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs. + +## Setup + +### Install +```bash +yum install mesa-libGL + +bash scripts/set_environment.sh . +# Please contact the staff to obtain the relevant installlation packages. 
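+# (The ixrt / tensorrt_llm / ixformer wheel names below are placeholders;
+#  substitute the exact file names of the packages you receive.)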
+pip3 install Path/To/ixrt-xxx.whl +pip3 install Path/To/tensorrt_llm-xxx.whl +pip3 install Path/To/ixformer-xxx.whl +``` + +### Download +-Model: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + +-Dataset:https://huggingface.co/datasets/cnn_dailymail + +```bash +# Download model from the website and make sure the model's path is "data/llama2-70b-chat" +# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail" +mkdir data + +# Please download rouge.py to this path if your server can't attach huggingface.co. +mkdir -p rouge/ +wget --no-check-certificate https://raw.githubusercontent.com/huggingface/evaluate/main/metrics/rouge/rouge.py -P rouge +``` + +## Inference +### FP16 + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +# Build engine +bash scripts/test_trtllm_llama2_70b_gpu8_build.sh +# Inference +bash scripts/test_trtllm_llama2_70b_gpu8.sh +``` \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/build.py b/models/nlp/large_language_model/llama2-70b/trtllm/build.py new file mode 100644 index 0000000000000000000000000000000000000000..4ff0c9eaa0cedfd382783a5cfcca9175bf38acad --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/build.py @@ -0,0 +1,1163 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import math +import os +import sys +import time +from pathlib import Path + +# isort: off +import torch +import torch.multiprocessing as mp +import tensorrt as trt +# isort: on +from transformers import LlamaConfig, LlamaForCausalLM + +try: + from transformers import MixtralForCausalLM +except ImportError: + MixtralForCausalLM = None + +try: + from transformers import LlavaConfig, LlavaForConditionalGeneration +except ImportError: + pass + +import tensorrt_llm +from tensorrt_llm import profiler +from tensorrt_llm._common import check_max_num_tokens +from tensorrt_llm._utils import str_dtype_to_trt +from tensorrt_llm.builder import Builder +from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.layers.attention import PositionEmbeddingType +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import quantize_model +from tensorrt_llm.network import net_guard +from tensorrt_llm.plugin.plugin import ContextFMHAType +from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.runtime.lora_manager import LoraConfig + +from tensorrt_llm.models.llama.weight import ( # isort:skip + get_scaling_factors, load_from_awq_llama, load_from_binary, + load_from_gptq_llama, load_from_hf_checkpoint, load_from_hf_llama, + load_from_meta_llama, parse_bin_config) + +MODEL_NAME = "llama" + +# 2 routines: get_engine_name, serialize_engine +# are direct copy from gpt example, TODO: put in utils? 
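+# Engines are built per rank (see build_rank_engine below) and written as
+# '{model}_{dtype}_tp{tp_size}_rank{rank}.engine', with an additional
+# 'pp{pp_size}' component when pipeline parallelism is used; see get_engine_name.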
+ +import onnx +from onnx import TensorProto, helper + + +def trt_dtype_to_onnx(dtype): + if dtype == trt.float16: + return TensorProto.DataType.FLOAT16 + if dtype == trt.bfloat16: + return TensorProto.DataType.BFLOAT16 + elif dtype == trt.float32: + return TensorProto.DataType.FLOAT + elif dtype == trt.int32: + return TensorProto.DataType.INT32 + elif dtype == trt.int64: + return TensorProto.DataType.INT64 + elif dtype == trt.bool: + return TensorProto.DataType.BOOL + else: + raise TypeError("%s is not supported" % dtype) + + +def to_onnx(network, path): + inputs = [] + for i in range(network.num_inputs): + network_input = network.get_input(i) + inputs.append( + helper.make_tensor_value_info( + network_input.name, trt_dtype_to_onnx(network_input.dtype), + list(network_input.shape))) + + outputs = [] + for i in range(network.num_outputs): + network_output = network.get_output(i) + outputs.append( + helper.make_tensor_value_info( + network_output.name, trt_dtype_to_onnx(network_output.dtype), + list(network_output.shape))) + + nodes = [] + for i in range(network.num_layers): + layer = network.get_layer(i) + layer_inputs = [] + for j in range(layer.num_inputs): + ipt = layer.get_input(j) + if ipt is not None: + layer_inputs.append(layer.get_input(j).name) + layer_outputs = [ + layer.get_output(j).name for j in range(layer.num_outputs) + ] + nodes.append( + helper.make_node(str(layer.type), + name=layer.name, + inputs=layer_inputs, + outputs=layer_outputs, + domain="com.nvidia")) + + onnx_model = helper.make_model(helper.make_graph(nodes, + 'attention', + inputs, + outputs, + initializer=None), + producer_name='NVIDIA') + onnx.save(onnx_model, path) + + +def get_engine_name(model, dtype, tp_size, pp_size, rank): + if pp_size == 1: + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, + pp_size, rank) + + +def serialize_engine(engine, path): + logger.info(f'Serializing engine to {path}...') + tik = time.time() + with open(path, 'wb') as f: + f.write(engine) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine serialized. Total time: {t}') + + +def parse_arguments(cmd_args=None): + parser = argparse.ArgumentParser() + parser.add_argument('--world_size', type=int, default=1) + parser.add_argument('--tp_size', type=int, default=1) + parser.add_argument('--pp_size', type=int, default=1) + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--bin_model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + parser.add_argument('--quant_ckpt_path', type=str, default=None) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--timing_cache', + type=str, + default='model.cache', + help= + 'The path of to read timing cache from, will be ignored if the file does not exist' + ) + parser.add_argument( + '--profiling_verbosity', + type=str, + default='layer_names_only', + choices=['layer_names_only', 'detailed', 'none'], + help= + 'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.' 
+ ) + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--vocab_size', type=int, default=32000) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--multiple_of', type=int, default=256) + parser.add_argument('--ffn_dim_multiplier', type=float, default=1.0) + parser.add_argument('--inter_size', type=int, default=None) + parser.add_argument('--hidden_act', type=str, default='silu') + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + parser.add_argument('--max_batch_size', type=int, default=8) + parser.add_argument('--max_input_len', type=int, default=2048) + parser.add_argument('--max_output_len', type=int, default=512) + parser.add_argument('--max_beam_width', type=int, default=1) + parser.add_argument('--rotary_base', type=float, default=10000.0) + parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) + parser.add_argument('--use_gpt_attention_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_gemm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_rmsnorm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--use_lookup_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_gather_last_token_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--use_activation_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--use_elementwise_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument("--use_cast_plugin", action="store_true") + + parser.add_argument('--parallel_build', default=False, action='store_true') + parser.add_argument('--enable_context_fmha', + default=False, + action='store_true') + parser.add_argument('--enable_context_fmha_fp32_acc', + default=False, + action='store_true') + parser.add_argument( + '--use_paged_context_fmha', + action='store_true', + help= + 'Activates paged context FMHA. This mode of the context FMHA is required for chunked context, speculative decoding and reuse of KV cache blocks. Context FMHA performance is worse when this mode is on.' + ) + parser.add_argument( + '--multi_block_mode', + default=False, + action='store_true', + help= + 'Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ + It is beneficial when batch x num_heads cannot fully utilize GPU.' + ) + parser.add_argument( + '--disable_xqa', + default=False, + action='store_true', + help= + 'Disable XQA optimization for the generation MHA. See more details in docs/gpt_attention.' 
+ ) + parser.add_argument('--visualize', default=False, action='store_true') + parser.add_argument('--load_by_shard', + action='store_true', + help='Load a pretrained model shard-by-shard.') + parser.add_argument('--enable_debug_output', + default=False, + action='store_true') + parser.add_argument('--gpus_per_node', type=int, default=8) + parser.add_argument('--builder_opt', type=int, default=None) + parser.add_argument( + '--output_dir', + type=str, + default='engine_outputs', + help= + 'The path to save the serialized engine files, timing cache file and model configs' + ) + parser.add_argument('--remove_input_padding', + default=False, + action='store_true') + parser.add_argument( + '--use_fused_mlp', + default=False, + action='store_true', + help= + 'Enable horizontal fusion in GatedMLP, reduces layer input traffic and potentially improves performance. ' + 'For FP8 PTQ, the downside is slight reduction of accuracy because one of the quantization scaling factors are discarded ' + '(0.45734 vs 0.45755 for LLaMA-v2 7B using ammo/examples/hf/instruct_eval/mmlu.py).' + ) + parser.add_argument('--enable_pos_shift', + default=False, + action='store_true', + help='Enable position shift for streamingllm method') + parser.add_argument( + '--dense_context_fmha', + default=False, + action='store_true', + help= + 'Enable dense fmha in context phase, otherwise sliding window attention.' + 'If dense_context_fmha=False, the sliding window size is the max attention window size.' + ) + # Arguments related to the quantization of the model. + parser.add_argument( + '--use_smooth_quant', + default=False, + action="store_true", + help= + 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' + 'See --per_channel and --per_token for finer-grained quantization options.' + ) + parser.add_argument( + '--per_channel', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. ' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_group', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale weights in the int4 range. ' + 'per_group chooses at run time, and for each group, a custom scaling factor. ' + 'The flag is built for GPTQ/AWQ quantization.') + parser.add_argument('--group_size', + type=int, + default=128, + help='Group size used in GPTQ/AWQ quantization.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=1, # Meta does TP on hidden dim + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). 
' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--enable_fp8', + default=False, + action='store_true', + help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') + parser.add_argument( + '--fp8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. fp8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--quantized_fp8_model_path', + type=str, + default=None, + help='Path of a quantized model checkpoint in .npz format') + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_awq', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--quantize_lm_head', + default=False, + action="store_true", + help='Quantize lm_head weights as well when using int4_awq.') + parser.add_argument( + '--use_inflight_batching', + action="store_true", + default=False, + help="Activates inflight batching mode of gptAttentionPlugin.") + parser.add_argument( + '--paged_kv_cache', + action="store_true", + default=False, + help= + 'By default we use contiguous KV cache. By setting this flag you enable paged KV cache' + ) + parser.add_argument('--tokens_per_block', + type=int, + default=128, + help='Number of tokens per block in paged KV cache') + parser.add_argument( + '--max_num_tokens', + type=int, + default=None, + help= + 'Define the max number of tokens supported by the engine, note that it takes no effect if --remove_input_padding is not set' + ) + parser.add_argument( + '--strongly_typed', + default=False, + action="store_true", + help= + 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' 
+ ) + parser.add_argument( + '--use_custom_all_reduce', + action='store_true', + help= + 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') + parser.add_argument( + '--max_prompt_embedding_table_size', + type=int, + default=0, + help='Setting to a value > 0 enables support for prompt tuning.') + parser.add_argument( + '--gather_all_token_logits', + action='store_true', + default=False, + help='Enable both gather_context_logits and gather_generation_logits') + parser.add_argument('--gather_context_logits', + action='store_true', + default=False, + help='Gather context logits') + parser.add_argument('--gather_generation_logits', + action='store_true', + default=False, + help='Gather generation logits') + parser.add_argument( + '--use_lora_plugin', + nargs='?', + const=None, + default=False, + choices=['float16', 'float32', 'bfloat16'], + help="Activates the lora plugin which enables embedding sharing.") + parser.add_argument( + '--lora_target_modules', + nargs='+', + default=None, + choices=[ + "attn_qkv", + "attn_q", + "attn_k", + "attn_v", + "attn_dense", + "mlp_h_to_4h", + "mlp_gate", + "mlp_4h_to_h", + ], + help= + "Add lora in which modules. Only be activated when use_lora_plugin is enabled." + ) + parser.add_argument('--hf_lora_dir', type=str, default=None) + parser.add_argument( + '--max_lora_rank', + type=int, + default=64, + help='maximum lora rank for different lora modules. ' + 'It is used to compute the workspace size of lora plugin.') + parser.add_argument( + '--moe_num_experts', + default=0, + type=int, + help='Specify the number of experts to use for MOE layers') + parser.add_argument( + '--moe_top_k', + default=0, + type=int, + help= + 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' + ) + parser.add_argument( + '--moe_tp_mode', + default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, + type=int, + help= + 'Controls how to distribute experts in TP. Check layers/moe.py for accepted values', + ) + parser.add_argument( + '--moe_renorm_mode', + default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + type=int, + help= + 'Controls renormalization after gate logits. Check layers/moe.py for accepted values', + ) + parser.add_argument("--total_build_time_target", type=float, default=0) + + args = parser.parse_args(cmd_args) + logger.set_level(args.log_level) + + assert args.total_build_time_target >= 0, "total_build_time_target must bigger than 0" + + assert not ( + args.use_smooth_quant and args.use_weight_only + ), "You cannot enable both SmoothQuant and INT8 weight-only together." + + if not args.remove_input_padding: + if args.use_gpt_attention_plugin: + logger.warning( + f"It is recommended to specify --remove_input_padding when using GPT attention plugin" + ) + + if args.use_inflight_batching: + if not args.use_gpt_attention_plugin: + args.use_gpt_attention_plugin = 'float16' + logger.info( + f"Using GPT attention plugin for inflight batching mode. 
Setting to default '{args.use_gpt_attention_plugin}'" + ) + if not args.remove_input_padding: + args.remove_input_padding = True + logger.info( + "Using remove input padding for inflight batching mode.") + if not args.paged_kv_cache: + args.paged_kv_cache = True + logger.info("Using paged KV cache for inflight batching mode.") + + if args.use_smooth_quant: + args.quant_mode = QuantMode.use_smooth_quant(args.per_token, + args.per_channel) + elif args.use_weight_only: + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=args.per_group, + use_int4_weights="int4" in args.weight_only_precision) + else: + args.quant_mode = QuantMode(0) + + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + elif args.fp8_kv_cache: + args.quant_mode = args.quant_mode.set_fp8_kv_cache() + if args.enable_fp8: + args.quant_mode = args.quant_mode.set_fp8_qdq() + + if args.rotary_scaling is not None: + assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." + rotary_scaling = { + "type": args.rotary_scaling[0], + "factor": float(args.rotary_scaling[1]) + } + assert rotary_scaling["type"] in ["linear", "dynamic"] + assert rotary_scaling["factor"] > 1.0 + args.rotary_scaling = rotary_scaling + + if args.model_dir is not None: + hf_config = LlamaConfig.from_pretrained(args.model_dir) + if hf_config.model_type == "llava": + # LLaVA = Vision model + Llama LLM + # We load a llava config and use its' text config as llama config + hf_config = LlavaConfig.from_pretrained(args.model_dir).text_config + hf_config.model_type = "llava" # Replace llama with llava + + args.inter_size = hf_config.intermediate_size # override the inter_size for LLaMA + args.n_embd = hf_config.hidden_size + args.n_head = hf_config.num_attention_heads + if hasattr(hf_config, "num_key_value_heads"): + args.n_kv_head = hf_config.num_key_value_heads + + # hf_config.num_hidden_layers = 1 # only for debug + args.n_layer = hf_config.num_hidden_layers + args.n_positions = hf_config.max_position_embeddings + args.vocab_size = hf_config.vocab_size if hf_config.vocab_size is not None else args.vocab_size + args.hidden_act = hf_config.hidden_act + args.rms_norm_eps = hf_config.rms_norm_eps + # These attributes only exists with Mixtral, for the moment + args.moe_num_experts = getattr(hf_config, "num_local_experts", + args.moe_num_experts) + args.moe_top_k = getattr(hf_config, "num_experts_per_tok", + args.moe_top_k) + args.rotary_base = getattr(hf_config, "rope_theta", args.rotary_base) + args.model_type = hf_config.model_type + if hf_config.model_type == "mixtral": + # HF LLaMA-type models are implicitly using gated activation. 
+ # With our MoE implementation, we must make it explicit + args.hidden_act = "swiglu" + + elif args.meta_ckpt_dir is not None: + with open(Path(args.meta_ckpt_dir, "params.json")) as fp: + meta_config: dict = json.load(fp) + args.n_embd = meta_config["dim"] + args.n_head = meta_config["n_heads"] + args.n_layer = meta_config["n_layers"] + args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) + if "hidden_dim" in meta_config: + args.inter_size = meta_config["hidden_dim"] + else: + args.multiple_of = meta_config.get("multiple_of", 1) + n_embd = int(4 * args.n_embd * 2 / 3) + args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) + // args.multiple_of) + args.rms_norm_eps = meta_config["norm_eps"] + args.moe_num_experts = meta_config.get("moe", {}).get("num_experts", 0) + args.moe_top_k = meta_config.get("moe", {}).get("num_experts_per_tok", + 0) + elif args.bin_model_dir is not None: + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head = parse_bin_config( + Path(args.bin_model_dir) / "config.ini") + args.inter_size = inter_size # override the inter_size for LLaMA + args.n_kv_head = n_kv_head + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size if args.vocab_size is None else args.vocab_size + args.hidden_act = hidden_act + args.rms_norm_eps = 1e-06 + logger.warning("Set rms_norm_eps to 1e-06 directly.") + if args.n_kv_head is None: + args.n_kv_head = args.n_head + elif args.n_kv_head != args.n_head: + assert (args.n_head % args.n_kv_head) == 0, \ + "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." + assert (args.n_kv_head % args.tp_size) == 0 or (args.tp_size % args.n_kv_head) == 0, \ + "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ + "the tensor parallelism size to be divisible by the number of K/V heads." 
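+    # Example: Llama-2-70B uses GQA with n_head=64 and n_kv_head=8, so the checks
+    # above pass for the 8-GPU tensor-parallel setup used in the README
+    # (8 K/V heads spread evenly across tp_size=8).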
+ + hf_modules_to_trtllm_modules = { + "q_proj": "attn_q", + "k_proj": "attn_k", + "v_proj": "attn_v", + "o_proj": "attn_dense", + "gate_proj": "mlp_h_to_4h", + "down_proj": "mlp_4h_to_h", + "up_proj": "mlp_gate" + } # lora modules on llama + + trtllm_modules_to_hf_modules = { + "attn_q": "q_proj", + "attn_k": "k_proj", + "attn_v": "v_proj", + "attn_dense": "o_proj", + "mlp_h_to_4h": "gate_proj", + "mlp_4h_to_h": "down_proj", + "mlp_gate": "up_proj", + } + + lora_config = LoraConfig.from_hf(args.hf_lora_dir, + hf_modules_to_trtllm_modules, + trtllm_modules_to_hf_modules) + + if lora_config.is_valid: + if args.lora_target_modules is None: + args.lora_target_modules = lora_config.lora_target_modules + # the lora checkpoint might finetune the embedding + if lora_config.vocab_size != 0: + args.vocab_size = lora_config.vocab_size + + args.lora_config = lora_config + + if args.weight_only_precision == 'int4_awq': + inter_alignment = args.tp_size * 128 + if args.inter_size % inter_alignment != 0: + args.inter_size = int((args.inter_size + inter_alignment - 1) / + inter_alignment) * inter_alignment + logger.info("To use awq we pad intermediate_size to {}.".format( + args.inter_size)) + + if args.quantize_lm_head: + vocab_alignment = args.tp_size * 64 + if args.vocab_size % vocab_alignment != 0: + args.vocab_size = int((args.vocab_size + vocab_alignment - 1) / + vocab_alignment) * vocab_alignment + logger.info("To use awq we pad vocab_size to {}.".format( + args.vocab_size)) + + assert args.pp_size * args.tp_size == args.world_size + + args.max_num_tokens = check_max_num_tokens( + max_num_tokens=args.max_num_tokens, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + remove_input_padding=args.remove_input_padding) + + assert (math.log2(args.tokens_per_block).is_integer() + ), "tokens_per_block must be power of 2" + if args.enable_context_fmha or args.enable_context_fmha_fp32_acc: + assert (args.tokens_per_block >= + 128), "Context fMHA requires >= 128 tokens per block" + + if args.inter_size is None: + # this should not be need when loading a real model + # but it is helpful when creating a dummy model without loading any real weights + n_embd = int(4 * args.n_embd * 2 / 3) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // + args.multiple_of) + logger.info(f"Setting inter_size to {args.inter_size}.") + + if args.enable_pos_shift: + assert args.use_gpt_attention_plugin, "Position shift is only support in the gpt attention plugin." 
+ assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc + + if args.moe_num_experts and args.moe_top_k == 0: + args.moe_top_k = 1 + args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, + args.moe_tp_mode, + args.moe_renorm_mode).validate() + + if args.gather_all_token_logits: + args.gather_context_logits = True + args.gather_generation_logits = True + + return args + + +def get_model_object(args, mapping, trt_dtype=None): + if trt_dtype is None: + trt_dtype = str_dtype_to_trt(args.dtype) + # Initialize Module + logger.debug("[Python]llama exampels, Initialize tensorrt_llm.models.LLaMAForCausalLM....") + tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM( + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + max_position_embeddings=args.n_positions, + dtype=trt_dtype, + mlp_hidden_size=args.inter_size, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + mapping=mapping, + rotary_base=args.rotary_base, + rotary_scaling=args.rotary_scaling, + use_parallel_embedding=args.use_parallel_embedding, + embedding_sharding_dim=args.embedding_sharding_dim, + quant_mode=args.quant_mode, + rms_norm_eps=args.rms_norm_eps, + use_fused_mlp=args.use_fused_mlp, + use_prompt_tuning=args.max_prompt_embedding_table_size > 0, + enable_pos_shift=args.enable_pos_shift, + dense_context_fmha=args.dense_context_fmha, + moe_config=args.moe_config, + max_lora_rank=args.max_lora_rank) + quantize_kwargs = {} + if args.use_smooth_quant or args.use_weight_only: + if args.weight_only_precision == 'int4_awq': + exclude_modules = ['lm_head'] if not args.quantize_lm_head else [] + quantize_kwargs = { + "group_size": args.group_size, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": exclude_modules, + } + elif args.weight_only_precision == 'int4_gptq': + quantize_kwargs = { + "group_size": args.group_size, + "zero": True, + "pre_quant_scale": False, + } + elif args.enable_fp8 or args.fp8_kv_cache: + logger.info(f'Loading scaling factors from ' + f'{args.quantized_fp8_model_path}') + quant_scales = get_scaling_factors(args.quantized_fp8_model_path, + num_layers=args.n_layer, + quant_mode=args.quant_mode) + quantize_kwargs = {"quant_scales": quant_scales} + + if args.use_weight_only and args.moe_config.has_moe(): + if 'exclude_modules' in quantize_kwargs: + quantize_kwargs['exclude_modules'].append('router') + else: + quantize_kwargs['exclude_modules'] = ['lm_head', 'router'] + + tensorrt_llm_llama = quantize_model(tensorrt_llm_llama, args.quant_mode, + **quantize_kwargs) + if args.per_group: + if args.weight_only_precision == 'int4_awq': + load_from_awq_llama(tensorrt_llm_llama=tensorrt_llm_llama, + quant_ckpt_path=args.quant_ckpt_path, + quantize_lm_head=args.quantize_lm_head, + mapping=mapping, + dtype=args.dtype, + bin_model_dir=args.bin_model_dir) + else: + load_from_gptq_llama(tensorrt_llm_llama=tensorrt_llm_llama, + quant_ckpt_path=args.quant_ckpt_path, + mapping=mapping, + dtype=args.dtype, + bin_model_dir=args.bin_model_dir) + elif args.meta_ckpt_dir is not None: + load_from_meta_llama(tensorrt_llm_llama, args.meta_ckpt_dir, mapping, + args.dtype) + elif args.model_dir is not None: + logger.info(f'Loading HF LLaMA ... 
from {args.model_dir}') + tik = time.time() + if not args.load_by_shard: + if args.model_type == "llava": + hf_llava = LlavaForConditionalGeneration.from_pretrained( + args.model_dir, torch_dtype="auto") + hf_llama = hf_llava.language_model + else: + hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM + hf_llama = hf_model.from_pretrained( + args.model_dir, + device_map={ + "model": "cpu", + "lm_head": "cpu", + "embed_tokens": "cpu", + "layers": "cpu", + "norm": "cpu", + }, # Load to CPU memory + torch_dtype='auto', + ) + use_gemm_woq_plugin = not args.disable_weight_only_quant_plugin + # hf_llama.config.num_hidden_layers = 1 # only for debug + load_from_hf_llama(tensorrt_llm_llama, + hf_llama, + mapping=mapping, + dtype=args.dtype, + use_gemm_woq_plugin=use_gemm_woq_plugin, + lora_config=args.lora_config) + del hf_llama + else: + load_from_hf_checkpoint(tensorrt_llm_llama, + args.model_dir, + mapping, + dtype=args.dtype, + lora_config=args.lora_config) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'HF LLaMA loaded. Total time: {t}') + + elif args.bin_model_dir is not None: + load_from_binary(tensorrt_llm_llama, + args.bin_model_dir, + mapping, + fp16=(args.dtype == 'float16'), + multi_query_mode=(args.n_kv_head != args.n_head)) + + return tensorrt_llm_llama + + +def update_plugin_configs(args, network): + if args.use_gpt_attention_plugin: + network.plugin_config.set_gpt_attention_plugin( + dtype=args.use_gpt_attention_plugin) + if args.use_gemm_plugin: + if not args.enable_fp8: + network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + else: + logger.info( + "Gemm plugin does not support FP8. Disabled Gemm plugin.") + if args.use_rmsnorm_plugin: + network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) + if args.use_lora_plugin: + network.plugin_config.set_lora_plugin(dtype=args.use_lora_plugin) + if args.use_lookup_plugin: + network.plugin_config.set_lookup_plugin(dtype=args.use_lookup_plugin) + if args.use_gather_last_token_plugin: + network.plugin_config.set_gather_last_token_plugin(dtype=args.use_gather_last_token_plugin) + if args.use_activation_plugin: + network.plugin_config.set_activation_plugin(dtype=args.use_activation_plugin) + if args.use_elementwise_plugin: + network.plugin_config.set_elementwise_plugin(dtype=args.use_elementwise_plugin) + if args.use_cast_plugin: + network.plugin_config.set_cast_plugin() + + # Quantization plugins. 
+ if args.use_smooth_quant: + network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) + network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) + network.plugin_config.set_quantize_tensor_plugin() + network.plugin_config.set_quantize_per_token_plugin() + assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) + if args.enable_context_fmha: + network.plugin_config.set_context_fmha(ContextFMHAType.enabled) + if args.enable_context_fmha_fp32_acc: + network.plugin_config.set_context_fmha( + ContextFMHAType.enabled_with_fp32_acc) + if args.multi_block_mode: + network.plugin_config.enable_mmha_multi_block_mode() + if not args.disable_xqa: + network.plugin_config.enable_xqa_optimization() + + if args.use_weight_only and not args.disable_weight_only_quant_plugin: + if args.per_group: + network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( + dtype=args.dtype) + else: + network.plugin_config.set_weight_only_quant_matmul_plugin( + dtype=args.dtype) + if args.world_size > 1: + network.plugin_config.set_nccl_plugin(args.dtype, + args.use_custom_all_reduce) + if args.remove_input_padding: + network.plugin_config.enable_remove_input_padding() + if args.paged_kv_cache: + network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) + return + + +def build_rank_engine(builder: Builder, + builder_config: tensorrt_llm.builder.BuilderConfig, + engine_name, rank, args): + ''' + @brief: Build the engine on the given rank. + @param rank: The rank to build the engine. + @param args: The cmd line arguments. + @return: The built engine. + ''' + dtype = str_dtype_to_trt(args.dtype) + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + + assert args.n_layer % args.pp_size == 0, \ + f"num_layers {args.n_layer} must be a multiple of pipeline parallelism size {args.pp_size}" + + # FIXME (Not Support libnvidia-ml.so) + # profiler.print_memory_usage(f'Rank {rank} Engine build starts') + # Initialize Module + tensorrt_llm_llama = get_model_object(args, + mapping=mapping, + trt_dtype=dtype) + + # FIXME (Not Support libnvidia-ml.so) + # profiler.print_memory_usage(f'Rank {rank} model weight loaded.') + + # Module -> Network + logger.debug("[Python]llama exampels, convert module to network....") + network = builder.create_network() + network.trt_network.name = engine_name + update_plugin_configs(args, network) + + if args.use_paged_context_fmha: + assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc, "context fmha must be enabled" + network.plugin_config.set_paged_context_fmha() + + logger.debug(f"[Python]llama exampels, network.plugin_config: \n{network.plugin_config}") + with net_guard(network): + # Prepare + network.set_named_parameters(tensorrt_llm_llama.named_parameters()) + + # Forward + inputs = tensorrt_llm_llama.prepare_inputs( + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_seq_len=args.max_input_len + args.max_output_len, + use_cache=True, + max_beam_width=args.max_beam_width, + max_num_tokens=args.max_num_tokens, + prompt_embedding_table_size=args.max_prompt_embedding_table_size, + gather_context_logits=args.gather_context_logits, + gather_generation_logits=args.gather_generation_logits, + lora_target_modules=args.lora_target_modules) + logger.info(f"[Python]llama exampels, forward....\n") + tensorrt_llm_llama(*inputs) + logger.info(f"[Python]llama exampels, forward finished\n") + if args.enable_debug_output: + # mark intermediate nodes' 
outputs + for k, v in tensorrt_llm_llama.named_network_outputs(): + logger.debug(f"enable_debug_output, debug tensor name: {k}") + v = v.trt_tensor + v.name = k + network.trt_network.mark_output(v) + v.dtype = dtype + if args.visualize: + model_path = os.path.join(args.output_dir, 'test.onnx') + to_onnx(network.trt_network, model_path) + + logger.debug("[Python]llama examples, tensorrt_llm.graph_rewriting.optimize....") + tensorrt_llm.graph_rewriting.optimize(network) + + engine = None + + # Network -> Engine + logger.debug("[Python]llama examples, builder.build_engine....") + engine = builder.build_engine(network, builder_config) + if rank == 0: + config_path = os.path.join(args.output_dir, 'config.json') + builder.save_config(builder_config, config_path) + + return engine + + +def get_builder_config_namespace(args, cache): + # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT + # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) + # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). + int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() + and not args.quant_mode.has_per_group_scaling()) or ( + not args.paged_kv_cache + and args.quant_mode.has_int8_kv_cache()) + config = argparse.Namespace( + name=MODEL_NAME, + precision=args.dtype, + timing_cache=args.timing_cache if cache is None else cache, + profiling_verbosity=args.profiling_verbosity, + tensor_parallel=args.tp_size, + pipeline_parallel=args.pp_size, + parallel_build=args.parallel_build, + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + max_position_embeddings=args.n_positions, + max_batch_size=args.max_batch_size, + max_beam_width=args.max_beam_width, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, + int8=int8_trt_flag, + quant_mode=args.quant_mode, + strongly_typed=args.strongly_typed, + opt_level=args.builder_opt, + max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, + gather_context_logits=args.gather_context_logits, + gather_generation_logits=args.gather_generation_logits, + lora_target_modules=args.lora_target_modules, + mlp_hidden_size=args.inter_size, + hf_modules_to_trtllm_modules=args.lora_config. + hf_modules_to_trtllm_modules, + trtllm_modules_to_hf_modules=args.lora_config. + trtllm_modules_to_hf_modules, + ) + return config + + +def build(rank, args): + torch.cuda.set_device(rank % args.gpus_per_node) + logger.set_level(args.log_level) + os.makedirs(args.output_dir, exist_ok=True) + + # when doing serializing build, all ranks share one engine + builder = Builder() + cache = None + for cur_rank in range(args.world_size): + # skip other ranks if parallel_build is enabled + if args.parallel_build and cur_rank != rank: + continue + tik = time.time() + + # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT + # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) + # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). 
+ int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() + and not args.quant_mode.has_per_group_scaling()) or ( + not args.paged_kv_cache + and args.quant_mode.has_int8_kv_cache()) + builder_config = builder.create_builder_config( + **vars(get_builder_config_namespace(args, cache))) + engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, + args.pp_size, cur_rank) + logger.debug("[Python]llama example, build_rank_engine....") + engine = build_rank_engine(builder, builder_config, engine_name, + cur_rank, args) + assert engine is not None, f'Failed to build engine for rank {cur_rank}' + + local_num_kv_heads = (args.n_kv_head + args.world_size - + 1) // args.world_size + kv_dtype = str_dtype_to_trt(args.dtype) + if args.quant_mode.has_int8_kv_cache(): + kv_dtype = str_dtype_to_trt('int8') + elif args.quant_mode.has_fp8_kv_cache(): + kv_dtype = str_dtype_to_trt('fp8') + + # FIXME (Not Support libnvidia-ml.so) + # profiler.check_gpt_mem_usage( + # engine=engine, + # kv_dtype=kv_dtype, + # use_gpt_attention_plugin=args.use_gpt_attention_plugin, + # paged_kv_cache=args.paged_kv_cache, + # max_batch_size=args.max_batch_size, + # max_beam_width=args.max_beam_width, + # max_seq_len=args.max_input_len + args.max_output_len, + # local_num_kv_heads=local_num_kv_heads, + # head_size=args.n_embd / args.n_head, + # num_layers=args.n_layer) + + if cur_rank == 0: + # Use in-memory timing cache for multiple builder passes. + if not args.parallel_build: + cache = builder_config.trt_builder_config.get_timing_cache() + + serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine + # FIXME (Not Support libnvidia-ml.so) + # profiler.print_memory_usage(f'Rank {cur_rank} Engine serialized') + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info( + f'Rank {cur_rank} Engine build time: {t} - {tok - tik} (sec)') + + if rank == 0: + ok = builder.save_timing_cache( + builder_config, os.path.join(args.output_dir, "model.cache")) + assert ok, "Failed to save timing cache." + + +if __name__ == '__main__': + args = parse_arguments() + print(args) + tik = time.time() + if args.parallel_build and args.world_size > 1 and \ + torch.cuda.device_count() >= args.world_size: + logger.warning( + f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' + ) + mp.spawn(build, nprocs=args.world_size, args=(args, )) + else: + args.parallel_build = False + logger.info('Serially build TensorRT engines.') + build(0, args) + + tok = time.time() + build_engine_time = tok - tik + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Total time of building all {args.world_size} engines: {t}') + + if args.total_build_time_target != 0: + status = build_engine_time <= args.total_build_time_target + if status: + print("successful.") + else: + print(f"Build engine time check failed! Target: {args.total_build_time_target}, Actual: {build_engine_time}") + sys.exit(int(not status)) diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/run.py b/models/nlp/large_language_model/llama2-70b/trtllm/run.py new file mode 100644 index 0000000000000000000000000000000000000000..3899ec9d55a33bca6eeeac4840353345467b474d --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/run.py @@ -0,0 +1,539 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import csv +from pathlib import Path +import sys +import time + +import numpy as np +import torch +import tensorrt_llm +import tensorrt_llm.profiler +from tensorrt_llm.logger import logger +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner + +from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, + load_tokenizer, read_model_name, throttle_generator) + +if PYTHON_BINDINGS: + from tensorrt_llm.runtime import ModelRunnerCpp + + +def parse_arguments(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument( + '--max_attention_window_size', + type=int, + default=None, + help= + 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' + ) + parser.add_argument('--sink_token_length', + type=int, + default=None, + help='The sink token length.') + parser.add_argument('--log_level', type=str, default='error') + parser.add_argument('--engine_dir', type=str, default='engine_outputs') + parser.add_argument('--use_py_session', + default=False, + action='store_true', + help="Whether or not to use Python runtime session") + parser.add_argument( + '--input_text', + type=str, + nargs='+', + default=["Born in north-east France, Soyer trained as a"]) + parser.add_argument( + '--no_prompt_template', + dest='use_prompt_template', + default=True, + action='store_false', + help= + "Whether or not to use default prompt template to wrap the input text.") + parser.add_argument( + '--input_file', + type=str, + help= + 'CSV or Numpy file containing tokenized input. Alternative to text input.', + default=None) + parser.add_argument('--max_input_length', type=int, default=923) + parser.add_argument('--output_csv', + type=str, + help='CSV file where the tokenized output is stored.', + default=None) + parser.add_argument('--output_npy', + type=str, + help='Numpy file where the tokenized output is stored.', + default=None) + parser.add_argument( + '--output_logits_npy', + type=str, + help= + 'Numpy file where the generation logits are stored. Use only when num_beams==1', + default=None) + parser.add_argument('--tokenizer_dir', + help="HF tokenizer config path", + default='gpt2') + parser.add_argument( + '--tokenizer_type', + help= + 'Specify that argument when providing a .model file as the tokenizer_dir. 
' + 'It allows AutoTokenizer to instantiate the correct tokenizer type.') + parser.add_argument('--vocab_file', + help="Used for sentencepiece tokenizers") + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams >1", + default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--debug_mode', + default=False, + action='store_true', + help="Whether or not to turn on the debug mode") + parser.add_argument('--no_add_special_tokens', + dest='add_special_tokens', + default=True, + action='store_false', + help="Whether or not to add special tokens") + parser.add_argument('--streaming', default=False, action='store_true') + parser.add_argument('--streaming_interval', + type=int, + help="How often to return tokens when streaming.", + default=5) + parser.add_argument( + '--prompt_table_path', + type=str, + help="Path to .npy file, exported by nemo_prompt_convert.py") + parser.add_argument( + '--prompt_tasks', + help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") + parser.add_argument('--lora_dir', + type=str, + default=None, + nargs="+", + help="The directory of LoRA weights") + parser.add_argument( + '--lora_task_uids', + type=str, + default=None, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module") + parser.add_argument('--lora_ckpt_source', + type=str, + default="hf", + choices=["hf", "nemo"], + help="The source of lora checkpoint.") + parser.add_argument( + '--num_prepend_vtokens', + nargs="+", + type=int, + help="Number of (default) virtual tokens to prepend to each sentence." + " For example, '--num_prepend_vtokens=10' will prepend the tokens" + " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + parser.add_argument( + '--run_profiling', + default=False, + action='store_true', + help="Run several 10 iterations to profile the inference latencies.") + parser.add_argument( + '--medusa_choices', + type=str, + default=None, + help="Medusa choice to use, if not none, will use Medusa decoding." + " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." 
+ ) + parser.add_argument('--target_load_engine_time', + type=float, + default=0) + parser.add_argument('--target_qps', + type=float, + default=0) + + return parser.parse_args(args=args) + + +def parse_input(tokenizer, + input_text=None, + prompt_template=None, + input_file=None, + add_special_tokens=True, + max_input_length=923, + pad_id=None, + num_prepend_vtokens=[], + model_name=None): + if pad_id is None: + pad_id = tokenizer.pad_token_id + + batch_input_ids = [] + if input_file is None: + for curr_text in input_text: + if prompt_template is not None: + curr_text = prompt_template.format(input_text=curr_text) + input_ids = tokenizer.encode(curr_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + input_ids = np.array(line, dtype='int32') + batch_input_ids.append(input_ids[-max_input_length:]) + elif input_file.endswith('.npy'): + inputs = np.load(input_file) + for row in inputs: + input_ids = row[row != pad_id] + batch_input_ids.append(input_ids[-max_input_length:]) + elif input_file.endswith('.txt'): + with open(input_file, 'r', encoding='utf-8', + errors='replace') as txt_file: + input_text = txt_file.read() + input_ids = tokenizer.encode( + input_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + else: + print('Input file format not supported.') + raise SystemExit + + if num_prepend_vtokens: + assert len(num_prepend_vtokens) == len(batch_input_ids) + base_vocab_size = tokenizer.vocab_size - len( + tokenizer.special_tokens_map.get('additional_special_tokens', [])) + for i, length in enumerate(num_prepend_vtokens): + batch_input_ids[i] = list( + range(base_vocab_size, + base_vocab_size + length)) + batch_input_ids[i] + if model_name == 'glm_10b': + for ids in batch_input_ids: + ids.append(tokenizer.sop_token_id) + batch_input_ids = [ + torch.tensor(x, dtype=torch.int32) for x in batch_input_ids + ] + return batch_input_ids + + +def print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=None, + output_npy=None, + context_logits=None, + generation_logits=None, + output_logits_npy=None): + batch_size, num_beams, _ = output_ids.size() + if output_csv is None and output_npy is None: + for batch_idx in range(batch_size): + inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( + ) + input_text = tokenizer.decode(inputs) + print(f'Input [Text {batch_idx}]: \"{input_text}\"') + for beam in range(num_beams): + output_begin = input_lengths[batch_idx] + output_end = sequence_lengths[batch_idx][beam] + outputs = output_ids[batch_idx][beam][ + output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + print( + f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + + if output_csv is not None: + output_file = Path(output_csv) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = output_ids.tolist() + with open(output_file, 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',') + writer.writerows(outputs) + + if output_npy is not None: + output_file = Path(output_npy) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') + np.save(output_file, outputs) 
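`parse_input` above also accepts pre-tokenized prompts through `--input_file`, either as a CSV with one row of comma-separated token ids per prompt or as a padded 2-D int32 `.npy` array. A hedged sketch of producing such files; the tokenizer path is an assumption and should point at your local HF checkpoint:

```python
# Sketch: build the tokenized CSV/NPY inputs that parse_input() accepts.
import csv
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("data/llama2-70b-chat", use_fast=False)
prompts = ["Born in north-east France, Soyer trained as a",
           "The quick brown fox"]

# CSV: one row of comma-separated token ids per prompt.
with open("input_ids.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for text in prompts:
        writer.writerow(tokenizer.encode(text))

# NPY: a padded 2-D int32 array; parse_input() strips pad_id from each row.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
ids = [tokenizer.encode(t) for t in prompts]
width = max(len(x) for x in ids)
arr = np.full((len(ids), width), pad_id, dtype=np.int32)
for row, x in enumerate(ids):
    arr[row, :len(x)] = x
np.save("input_ids.npy", arr)
```

Either file can then be passed to run.py via `--input_file input_ids.csv` (or `input_ids.npy`) instead of `--input_text`.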
+ + # Save context logits + if context_logits is not None and output_logits_npy is not None: + context_logits = torch.cat(context_logits, axis=0) + vocab_size_padded = context_logits.shape[-1] + context_logits = context_logits.reshape([1, -1, vocab_size_padded]) + + output_context_logits_npy = output_logits_npy.split( + '.npy')[0] + "_context" + output_context_logits_file = Path(output_context_logits_npy) + context_outputs = np.array( + context_logits.squeeze(0).cpu().contiguous(), + dtype='float32') # [promptLengthSum, vocabSize] + np.save(output_context_logits_file, context_outputs) + + # Save generation logits + if generation_logits is not None and output_logits_npy is not None and num_beams == 1: + output_generation_logits_npy = output_logits_npy.split( + '.npy')[0] + "_generation" + output_generation_logits_file = Path(output_generation_logits_npy) + generation_outputs = np.array(generation_logits.cpu().contiguous(), + dtype='float32') + np.save(output_generation_logits_file, generation_outputs) + + +def check_status(args, load_engine_time, qps): + print("==================== check status ====================") + successful = True + if args.target_load_engine_time != 0 and load_engine_time > args.target_load_engine_time: + print(f"Load engine time check failed! Target: {args.target_load_engine_time}, Actual: {load_engine_time}") + successful = False + if args.target_qps != 0 and qps < args.target_qps: + print(f"Performance check failed! Target: {args.target_qps}, Actual: {qps}") + successful = False + return successful + + +def main(args): + runtime_rank = tensorrt_llm.mpi_rank() + logger.set_level(args.log_level) + + model_name = read_model_name(args.engine_dir) + if args.tokenizer_dir is None: + args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] + + tokenizer, pad_id, end_id = load_tokenizer( + tokenizer_dir=args.tokenizer_dir, + vocab_file=args.vocab_file, + model_name=model_name, + tokenizer_type=args.tokenizer_type, + ) + + # # An example to stop generation when the model generate " London" on first sentence, " eventually became" on second sentence + # stop_words_list = [[" London"], ["eventually became"]] + # stop_words_list = tensorrt_llm.runtime.to_word_list_format(stop_words_list, tokenizer) + # stop_words_list = torch.Tensor(stop_words_list).to(torch.int32).to("cuda").contiguous() + stop_words_list = None + + # # An example to prevent generating " chef" on first sentence, " eventually" and " chef before" on second sentence + # bad_words_list = [[" chef"], [" eventually, chef before"]] + # bad_words_list = tensorrt_llm.runtime.to_word_list_format(bad_words_list, tokenizer) + # bad_words_list = torch.Tensor(bad_words_list).to(torch.int32).to("cuda").contiguous() + bad_words_list = None + + prompt_template = None + if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: + prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] + batch_input_ids = parse_input(tokenizer=tokenizer, + input_text=args.input_text, + prompt_template=prompt_template, + input_file=args.input_file, + add_special_tokens=args.add_special_tokens, + max_input_length=args.max_input_length, + pad_id=pad_id, + num_prepend_vtokens=args.num_prepend_vtokens, + model_name=model_name) + input_lengths = [x.size(0) for x in batch_input_ids] + + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." 
+ ) + args.use_py_session = True + if args.debug_mode and not args.use_py_session: + logger.warning( + "Debug mode is not supported in C++ session for now, fallback to Python session." + ) + args.use_py_session = True + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict(engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.use_py_session, "Medusa is only supported by py_session" + assert args.temperature == 0, "Medusa should use temperature == 0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=len(batch_input_ids), + max_input_len=max(input_lengths), + max_output_len=args.max_output_len, + max_beam_width=args.num_beams, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + ) + runner = runner_cls.from_dir(**runner_kwargs) + + torch.cuda.synchronize() + start_time = time.time() + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + lora_uids=args.lora_task_uids, + prompt_table_path=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True, + medusa_choices=args.medusa_choices) + torch.cuda.synchronize() + + status = False + end_time = time.time() + if runtime_rank == 0: + num_inputs = sum([torch.numel(x) for x in batch_input_ids]) + num_outputs = torch.numel(outputs["output_ids"]) + num_gens = num_outputs - num_inputs + + load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") + qps = num_gens/(end_time-start_time) + logger.info(f'Load engine takes: {load_engine_time} sec') + print(f"input tokens: {num_inputs}, generate tokens: {num_gens}, QPS: {qps}") + status = check_status(args, load_engine_time, qps) + else: + status = True + + if args.streaming: + for curr_outputs in throttle_generator(outputs, + args.streaming_interval): + if runtime_rank == 0: + output_ids = curr_outputs['output_ids'] + sequence_lengths = curr_outputs['sequence_lengths'] + print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy) + else: + if runtime_rank == 0: + output_ids = outputs['output_ids'] + sequence_lengths = outputs['sequence_lengths'] + context_logits = None + generation_logits = None + if runner.gather_context_logits: + context_logits = outputs['context_logits'] + if runner.gather_generation_logits: + generation_logits = outputs['generation_logits'] + print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy, + context_logits=context_logits, + generation_logits=generation_logits, + 
output_logits_npy=args.output_logits_npy) + + if args.run_profiling: + ite = 10 + # warmup + for _ in range(ite): + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + lora_uids=args.lora_task_uids, + prompt_table_path=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True) + torch.cuda.synchronize() + + tensorrt_llm.profiler.start("tmp") + for _ in range(ite): + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + lora_uids=args.lora_task_uids, + prompt_table_path=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True) + torch.cuda.synchronize() + tensorrt_llm.profiler.stop("tmp") + + print( + f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" + ) + if status: + print("successful.") + else: + print("failed.") + sys.exit(int(not status)) + + +if __name__ == '__main__': + args = parse_arguments() + print(args) + main(args) diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/requirements.txt b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7cbbb8b7e9bbd8aab6303fd8b5de1dacbd353b8 --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/requirements.txt @@ -0,0 +1,30 @@ +accelerate +build +colored +# cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. 
+diffusers +lark +mpi4py +numpy +onnx>=1.12.0 +polygraphy +psutil +pybind11 +pynvml>=11.5.0 +sentencepiece>=0.1.99 +# tensorrt==9.2.0.post12.dev5 +# torch +# nvidia-ammo~=0.5.0; platform_machine=="x86_64" +transformers +wheel +optimum +evaluate +janus +parameterized +scikit-learn + +# special +scipy==1.11.4 +pandas==1.5.3 +nltk +rouge_score diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/set_environment.sh b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/set_environment.sh new file mode 100644 index 0000000000000000000000000000000000000000..97e20ae79b78f7a6f65ba0837002389a85ae0e7d --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/set_environment.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +PROJECT_DIR=$1 + +pip3 install -r "$PROJECT_DIR/scripts/requirements.txt" + diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8.sh b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8.sh new file mode 100644 index 0000000000000000000000000000000000000000..042f1fd06abedbe3b856a726ff15a07982426cd6 --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +EXIT_STATUS=0 +LOG_LEVEL=info +BS=${BS:-1} +DTYPE=${DTYPE:-"float16"} + +PROJECT_DIR="./" + +DATASET_DIR=${DATASET_DIR:-"${PROJECT_DIR}/data/datasets_cnn_dailymail"} +MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-70b-chat"} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}/checkpoints/"} + +export TLLM_LOG_LEVEL=${LOG_LEVEL} +export PLUGIN_DTYPE="float16" + +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + + +export TASK_DATA_PATH=${DATASET_DIR} + +# target is 95% of best (load engine time: 14.65, rouge1: 29.19, tps: 18.59) +mpirun -n 8 --allow-run-as-root \ +python3 ${PROJECT_DIR}/summarize.py \ +--test_trt_llm \ +--log_level ${LOG_LEVEL} \ +--batch_size ${BS} \ +--data_type ${DTYPE} \ +--hf_model_dir ${MODEL_DIR} \ +--tokenizer_dir ${MODEL_DIR} \ +--tokenizer_type "llama" \ +--engine_dir ${ENGINE_DIR} \ +--target_load_engine_time 15.4 \ +--tensorrt_llm_rouge1_threshold 27.73 \ +--target_tps 17.66 \ +--use_py_session "$@"; check_status +exit ${EXIT_STATUS} diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8_build.sh b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8_build.sh new file mode 100644 index 0000000000000000000000000000000000000000..27af165062b22f46443d127e9da2ce8057f72b7b --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8_build.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +EXIT_STATUS=0 +LOG_LEVEL=info +BS=${BS:-1} +DTYPE=${DTYPE:-"float16"} + +PROJECT_DIR="./" + +MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-70b-chat"} +OUTPUT_DIR=${OUTPUT_DIR:-"${PROJECT_DIR}/checkpoints/"} + +export TLLM_LOG_LEVEL=${LOG_LEVEL} +export PLUGIN_DTYPE="float16" + +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +python3 ${PROJECT_DIR}/build.py \ +--log_level ${LOG_LEVEL} \ +--dtype ${DTYPE} \ +--model_dir ${MODEL_DIR} \ +--remove_input_padding \ +--use_gpt_attention_plugin float16 --use_gemm_plugin float16 \ +--enable_context_fmha \ +--world_size 8 \ +--tp_size 8 \ +--output_dir ${OUTPUT_DIR} "$@"; check_status +exit ${EXIT_STATUS} + + diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/summarize.py 
b/models/nlp/large_language_model/llama2-70b/trtllm/summarize.py new file mode 100644 index 0000000000000000000000000000000000000000..acf06abd7708c098c30a40bc905a52d84d83deb6 --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/summarize.py @@ -0,0 +1,724 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import os +from pathlib import Path +import sys +import time + +import evaluate +import numpy as np +import torch +from datasets import load_dataset, load_from_disk +from transformers import (AutoModel, AutoModelForCausalLM, + AutoModelForSeq2SeqLM, GenerationConfig) +from utils import DEFAULT_HF_MODEL_DIRS, load_tokenizer, read_model_name + +import tensorrt_llm +import tensorrt_llm.profiler as profiler +from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm.logger import logger +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner +from tensorrt_llm.tools.ppl import ppl + +if PYTHON_BINDINGS: + from tensorrt_llm.runtime import ModelRunnerCpp + + +def check_status(args, load_engine_time, rouge1, tps): + print("==================== check status ====================") + successful = True + if args.target_load_engine_time != 0 and load_engine_time > args.target_load_engine_time: + print(f"Load engine time check failed! Target: {args.target_load_engine_time}, Actual: {load_engine_time}") + successful = False + if rouge1 < args.tensorrt_llm_rouge1_threshold: + print(f"Accuracy check failed! Target: {args.tensorrt_llm_rouge1_threshold}%, Actual: {rouge1}%") + successful = False + if args.target_tps != 0 and tps < args.target_tps: + print(f"Performance check failed! 
Target: {args.target_tps}, Actual: {tps}") + successful = False + return successful + + +def main(args): + runtime_rank = tensorrt_llm.mpi_rank() + logger.set_level(args.log_level) + + model_name = read_model_name(args.engine_dir) + if args.hf_model_dir is None: + args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + if args.tokenizer_dir is None: + args.tokenizer_dir = args.hf_model_dir + + test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 + test_trt_llm = args.test_trt_llm + profiler.start('load tokenizer') + tokenizer, pad_id, end_id = load_tokenizer( + tokenizer_dir=args.tokenizer_dir, + vocab_file=args.vocab_file, + model_name=model_name, + tokenizer_type=args.tokenizer_type, + ) + profiler.stop('load tokenizer') + logger.info( + f'Load tokenizer takes: {profiler.elapsed_time_in_sec("load tokenizer")} sec' + ) + + if args.eval_task == 'code_completion': + dataset_name = "openai_humaneval" + dataset_revision = None + dataset_input_key = 'prompt' + dataset_output_key = 'canonical_solution' + dataset_split = 'test' + elif args.eval_task == 'summarize': + dataset_name = "ccdv/cnn_dailymail" + dataset_revision = "3.0.0" + dataset_input_key = 'article' + dataset_output_key = 'highlights' + dataset_split = 'test' + elif args.eval_task == 'summarize_long': + dataset_name = "tau/zero_scrolls" + dataset_revision = 'squality' + dataset_input_key = 'input' + dataset_output_key = 'output' + dataset_split = 'validation' # only this split contains reference strings + + + logger.info(f"prepare datasets....") + if os.getenv("TASK_DATA_PATH"): + dataset = load_from_disk(os.getenv("TASK_DATA_PATH"))[dataset_split] + else: + # dataset = load_dataset(dataset_name, + # dataset_revision, + # cache_dir=args.dataset_path, + # split=dataset_split, + # trust_remote_code=True) + + dataset = load_dataset(dataset_name, + dataset_revision, + cache_dir=args.dataset_path, + split=dataset_split) + + logger.info(f"datasets is ready.") + max_batch_size = args.batch_size + + # runtime parameters + top_k = args.top_k + top_p = args.top_p + output_len = args.output_len + test_token_num = args.max_input_length + max_attention_window_size = args.max_attention_window_size + sink_token_length = args.sink_token_length + + # random_seed = 5 + temperature = args.temperature + num_beams = args.num_beams + length_penalty = args.length_penalty + repetition_penalty = args.repetition_penalty + presence_penalty = args.presence_penalty + frequency_penalty = args.frequency_penalty + + if test_trt_llm: + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." 
+ ) + args.use_py_session = True + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict(engine_dir=args.engine_dir, + rank=runtime_rank, + debug_mode=args.debug_mode) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.use_py_session, "Medusa is only supported by py_session" + assert args.temperature == 0, "Medusa should use temperature == 0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=max_batch_size, + max_input_len=test_token_num, + max_output_len=output_len, + max_beam_width=num_beams, + max_attention_window_size=max_attention_window_size, + sink_token_length=sink_token_length) + runner = runner_cls.from_dir(**runner_kwargs) + assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ + "PPL evaluation requires engine built with gather_all_token_logits enabled" + + if test_hf: + profiler.start('load HF model') + dtype_alias_mapping = { + 'fp32': 'float32', + 'fp16': 'float16', + 'bf16': 'bfloat16' + } + args.data_type = dtype_alias_mapping.get(args.data_type, args.data_type) + if model_name.startswith('chatglm'): + auto_model_cls = AutoModel + elif model_name.startswith('glm'): + auto_model_cls = AutoModelForSeq2SeqLM + else: + auto_model_cls = AutoModelForCausalLM + model = auto_model_cls.from_pretrained( + args.hf_model_dir, + trust_remote_code=True, + torch_dtype=str_dtype_to_torch(args.data_type), + device_map='auto' if args.hf_device_map_auto else None) + try: + model.to_bettertransformer() + except ValueError as e: + logger.warning( + f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' + ) + if not args.hf_device_map_auto: + model.cuda() + if model_name == 'qwen': + model.generation_config = GenerationConfig.from_pretrained( + args.hf_model_dir, trust_remote_code=True) + profiler.stop('load HF model') + logger.info( + f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' + ) + + output_dir = Path(args.output_dir) if args.output_dir else None + if output_dir is not None: + output_dir.mkdir(exist_ok=True, parents=True) + if test_trt_llm: + with (output_dir / 'trtllm.out').open('w') as f: + f.write(f'Engine path: {args.engine_dir}\n') + f.write(f'Tokenizer path: {args.tokenizer_dir}\n') + if test_hf: + with (output_dir / 'hf.out').open('w') as f: + f.write(f'Model path: {args.hf_model_dir}\n') + f.write(f'Tokenizer path: {args.tokenizer_dir}\n') + + def _prepare_inputs(batch_input_texts, + eval_task='summarize', + add_special_tokens=True): + batch_size = len(batch_input_texts) + append_str = ' TL;DR: ' if eval_task == 'summarize' else '' + batch_input_ids = [] + for i in range(batch_size): + curr_text = batch_input_texts[i] + append_str + curr_text = curr_text.strip().replace(" n't", "n't") + + # TODO: The below lines are used to be compatible with the original code; may need fix + if model_name.startswith(('chatglm2', 'chatglm3')): + input_ids = tokenizer.encode(curr_text, + return_tensors='pt').squeeze(0) + input_ids = input_ids[:test_token_num] + elif model_name == 'qwen': + from qwen.utils.utils import make_context + # use make_content to generate prompt + system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user." 
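When the machine cannot reach huggingface.co, the `TASK_DATA_PATH` branch above expects a dataset saved in the `datasets` on-disk format. A minimal sketch of preparing that copy once on a connected machine; the output path mirrors the `DATASET_DIR` default used by the test scripts and is otherwise an assumption (recent `datasets` releases may additionally require `trust_remote_code=True`):

```python
# Sketch: pre-fetch cnn_dailymail 3.0.0 (the --eval_task summarize dataset)
# into a directory that load_from_disk() can read offline.
from datasets import load_dataset

dataset = load_dataset("ccdv/cnn_dailymail", "3.0.0")   # DatasetDict with train/validation/test
dataset.save_to_disk("data/datasets_cnn_dailymail")
```

Pointing `TASK_DATA_PATH` at that directory (as the provided shell scripts already do via `DATASET_DIR`) makes `load_from_disk(os.getenv("TASK_DATA_PATH"))["test"]` work without network access.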
+ _, input_id_list = make_context( + tokenizer=tokenizer, + query=curr_text, + history=[], + system=system_prompt, + max_input_length=test_token_num, + ) + input_ids = torch.tensor(input_id_list) + else: + input_ids = tokenizer.encode( + curr_text, + return_tensors='pt', + add_special_tokens=add_special_tokens, + truncation=True, + max_length=test_token_num).squeeze(0) + + batch_input_ids.append(input_ids) + return batch_input_ids + + def eval_trt_llm(datapoint, + eval_task='summarize', + eval_ppl=False, + add_special_tokens=True): + batch_size = len(datapoint[dataset_input_key]) + batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], + eval_task=eval_task, + add_special_tokens=add_special_tokens) + input_lengths = [x.size(0) for x in batch_input_ids] + + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=output_len, + max_attention_window_size=max_attention_window_size, + sink_token_length=sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=temperature, + top_k=top_k, + top_p=top_p, + num_beams=num_beams, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + output_sequence_lengths=True, + return_dict=True, + medusa_choices=args.medusa_choices) + torch.cuda.synchronize() + + # Extract a list of tensors of shape beam_width x output_ids. + if runtime_rank == 0: + output_ids = outputs['output_ids'] + output_beams_list = [ + tokenizer.batch_decode(output_ids[batch_idx, :, + input_lengths[batch_idx]:], + skip_special_tokens=True) + for batch_idx in range(batch_size) + ] + output_ids_list = [ + output_ids[batch_idx, :, input_lengths[batch_idx]:] + for batch_idx in range(batch_size) + ] + + ppls = [[] for _ in range(batch_size)] + seq_lengths_array = outputs["sequence_lengths"].cpu().tolist() + lengths_info = { + 'input_lengths': input_lengths, + 'seq_lengths': seq_lengths_array + } + if eval_ppl: + seq_lengths = outputs['sequence_lengths'] + context_logits = outputs['context_logits'] + # Remove the first generation logits which are same to last context logits + generation_logits = outputs['generation_logits'][:, :, 1:] + for batch_idx in range(batch_size): + # [batch, beam, step] + for beam_idx in range(num_beams): + curr_len = seq_lengths[batch_idx, beam_idx] + curr_ctx_len = input_lengths[batch_idx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[batch_idx, beam_idx, 1:curr_len] + curr_logits = torch.cat([ + context_logits[batch_idx], + generation_logits[batch_idx, + beam_idx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + logger.debug( + f"TensorRT-LLM PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + ppls[batch_idx].append(curr_ppl) + + return output_beams_list, output_ids_list, ppls, lengths_info + return [], [], [], {} + + def eval_hf(datapoint, + eval_task='summarize', + eval_ppl=False, + add_special_tokens=True): + batch_size = len(datapoint[dataset_input_key]) + if batch_size > 1: + logger.warning( + f"HF does not support batch_size > 1 to verify correctness due to padding. 
Current batch size is {batch_size}" + ) + batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], + eval_task=eval_task, + add_special_tokens=add_special_tokens) + input_lengths = [x.size(0) for x in batch_input_ids] + # Left padding for HF + max_length = max(input_lengths) + paddings = [ + torch.ones(max_length - l, dtype=torch.int32) * pad_id + for l in input_lengths + ] + batch_input_ids = [ + torch.cat([pad, x]) for x, pad in zip(batch_input_ids, paddings) + ] + batch_input_ids = torch.stack(batch_input_ids) + batch_input_ids = batch_input_ids.cuda() + + with torch.no_grad(): + outputs = model.generate(batch_input_ids, + max_new_tokens=output_len, + top_k=top_k, + temperature=temperature, + eos_token_id=end_id, + pad_token_id=pad_id, + num_beams=num_beams, + num_return_sequences=num_beams, + early_stopping=True, + length_penalty=length_penalty, + output_scores=True, + return_dict_in_generate=True) + if eval_ppl and batch_size == 1: + # model.generate cannot return context logits? + # Will cause additional latency + context_outputs = model(batch_input_ids) + + output_ids = outputs['sequences'] + tokens_list = output_ids[:, len(batch_input_ids[0]):].tolist() + output_ids = output_ids.reshape([batch_size, num_beams, -1]) + output_lines_list = [ + tokenizer.batch_decode(output_ids[:, i, + len(batch_input_ids[0]):], + skip_special_tokens=True) + for i in range(num_beams) + ] + + ppls = [[] for _ in range(batch_size)] + if eval_ppl and batch_size == 1: + # Only for batch size of 1 + seq_lens = (output_ids != end_id).logical_and( + output_ids != pad_id).sum(dim=-1) + context_logits = context_outputs['logits'] + # Remove the first generation logits which are same to last context logits + generation_logits = torch.stack(outputs['scores'][1:], dim=1) + _, max_gen_len, voc_size = generation_logits.size() + generation_logits = generation_logits.view(batch_size, num_beams, + max_gen_len, voc_size) + for batch_idx in range(batch_size): + for beam_idx in range(num_beams): + curr_len = seq_lens[batch_idx, beam_idx] + curr_ctx_len = input_lengths[batch_idx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[batch_idx, beam_idx, 1:curr_len] + curr_logits = torch.cat([ + context_logits[batch_idx], + generation_logits[batch_idx, + beam_idx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + logger.debug( + f"HF PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + ppls[batch_idx].append(curr_ppl) + + return output_lines_list, tokens_list, ppls + + if test_trt_llm: + datapoint = dataset[0:1] + output, *_ = eval_trt_llm(datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + if runtime_rank == 0: + logger.info( + "---------------------------------------------------------") + logger.info("TensorRT-LLM Generated : ") + logger.info(f" Input : {datapoint[dataset_input_key]}") + logger.info(f"\n Reference : {datapoint[dataset_output_key]}") + logger.info(f"\n Output : {output}") + logger.info( + "---------------------------------------------------------") + if test_hf: + datapoint = dataset[0:1] + output, *_ = eval_hf(datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + logger.info("---------------------------------------------------------") + logger.info("HF Generated : ") + logger.info(f" Input : {datapoint[dataset_input_key]}") + logger.info(f"\n Reference : {datapoint[dataset_output_key]}") + logger.info(f"\n Output : {output}") + 
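The perplexity values computed above feed the concatenated context and generation logits, together with the generated ids, into `tensorrt_llm.tools.ppl.ppl`. For orientation, a generic per-token perplexity helper looks roughly like the following; it is a plain cross-entropy sketch and is not claimed to be identical to the library implementation:

```python
# Generic per-token perplexity: exp of the mean negative log-likelihood.
import torch
import torch.nn.functional as F

def perplexity(logits: torch.Tensor, token_ids: torch.Tensor) -> float:
    """logits: [seq_len, vocab]; token_ids: [seq_len]; logits[i] scores token_ids[i]."""
    log_probs = F.log_softmax(logits.float(), dim=-1)
    nll = -log_probs[torch.arange(token_ids.numel()), token_ids]
    return torch.exp(nll.mean()).item()

# Toy usage with random values (shapes only; real inputs come from the runner).
logits = torch.randn(5, 32000)
ids = torch.randint(0, 32000, (5,))
print(perplexity(logits, ids))
```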
logger.info("---------------------------------------------------------") + + # TODO: Add random_seed flag in gptj + metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)] + metric_hf = [evaluate.load("rouge") for _ in range(num_beams)] + for i in range(num_beams): + metric_tensorrt_llm[i].seed = 0 + metric_hf[i].seed = 0 + ppls_trt_llm = [[] for _ in range(num_beams)] + ppls_hf = [[] for _ in range(num_beams)] + + ite_count = 0 + data_point_idx = 0 + total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 + + if args.stability_test: + logger.info(f"stability test, need {args.stability_test_hours} hours") + else: + logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") + stability_start_time = time.time() + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + max_batch_size)] + + if test_trt_llm: + profiler.start('tensorrt_llm') + output_tensorrt_llm, output_ids_trt_llm, curr_ppls_trt_llm, lengths_info = eval_trt_llm( + datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + profiler.stop('tensorrt_llm') + if runtime_rank == 0: + input_lengths = lengths_info['input_lengths'] + seq_lengths = lengths_info['seq_lengths'] + output_token_count_trt_llm = sum( + seq_lengths[idx][0] - input_lengths[idx] + for idx in range(len(input_lengths))) + total_output_token_count_trt_llm += output_token_count_trt_llm + + if test_hf: + profiler.start('hf') + output_hf, _, curr_ppls_hf = eval_hf( + datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + profiler.stop('hf') + + if runtime_rank == 0: + if test_trt_llm: + for batch_idx in range(len(output_tensorrt_llm)): + for beam_idx in range(num_beams): + metric_tensorrt_llm[beam_idx].add_batch( + predictions=[ + output_tensorrt_llm[batch_idx][beam_idx] + ], + references=[ + datapoint[dataset_output_key][batch_idx] + ]) + if args.eval_ppl: + ppls_trt_llm[beam_idx].append( + curr_ppls_trt_llm[batch_idx][beam_idx]) + if output_dir is not None: + # yapf: disable + for i in range(len(output_tensorrt_llm[0])): + for beam_idx in range(num_beams): + with (output_dir / 'trtllm.out').open('a') as f: + f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n') + # yapf: enable + if test_hf: + for beam_idx in range(num_beams): + for batch_idx in range(len(output_hf[beam_idx])): + metric_hf[beam_idx].add_batch( + predictions=[output_hf[beam_idx][batch_idx]], + references=[ + datapoint[dataset_output_key][batch_idx] + ]) + if args.eval_ppl and args.batch_size == 1: + ppls_hf[beam_idx].append( + curr_ppls_hf[batch_idx][beam_idx]) + if output_dir is not None: + # yapf: disable + for i in range(len(output_hf[0])): + for beam_idx in range(num_beams): + with (output_dir / 'hf.out').open('a') as f: + f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n') + # yapf: enable + + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") + if test_trt_llm: + logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') + if test_hf: + logger.debug(f'HF Output: {output_hf}') + logger.debug(f"Reference : {datapoint[dataset_output_key]}") + + data_point_idx += max_batch_size + ite_count += 1 + + if args.stability_test: + test_time_hours = round((time.time() - 
stability_start_time)/3600, 1) + if test_time_hours > args.stability_test_hours: + if runtime_rank == 0: + logger.info(f"Stability Test Finished. Total run {test_time_hours} hours.") + break + else: + data_point_idx = data_point_idx % len(dataset) + ite_count = ite_count % args.max_ite + if runtime_rank == 0 and ite_count % 100 == 0: + logger.info(f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") + elif runtime_rank == 0 and ite_count % 10 == 0: + logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + + if runtime_rank == 0: + if test_trt_llm: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' + ) + logger.info( + f'TensorRT-LLM (total output tokens: {total_output_token_count_trt_llm})' + ) + logger.info( + f'TensorRT-LLM (tokens per second: {total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm")})' + ) + + rouge1 = 0 + tps = total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm") + + for beam_idx in range(num_beams): + logger.info(f"TensorRT-LLM beam {beam_idx} result") + computed_metrics_tensorrt_llm = metric_tensorrt_llm[ + beam_idx].compute() + for key in computed_metrics_tensorrt_llm.keys(): + logger.info( + f' {key} : {computed_metrics_tensorrt_llm[key]*100}') + + if args.check_accuracy and beam_idx == 0: + assert computed_metrics_tensorrt_llm[ + 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold + + if beam_idx == 0: + rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 + + if args.eval_ppl: + logger.info( + f" Per-token perplexity: {np.mean(ppls_trt_llm[beam_idx])}" + ) + if args.check_accuracy and beam_idx == 0: + assert np.mean(ppls_trt_llm[beam_idx] + ) < args.tensorrt_llm_ppl_threshold + + load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") + logger.info(f'Load engine takes: {load_engine_time} sec') + + status = check_status(args, load_engine_time, rouge1, tps) + if status: + print("successful.") + else: + print("failed.") + + sys.exit(int(not status)) + + if test_hf: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'Hugging Face (total latency: {profiler.elapsed_time_in_sec("hf")} sec)' + ) + for beam_idx in range(num_beams): + logger.info(f"HF beam {beam_idx} result") + computed_metrics_hf = metric_hf[beam_idx].compute() + for key in computed_metrics_hf.keys(): + logger.info(f' {key} : {computed_metrics_hf[key]*100}') + if args.eval_ppl and args.batch_size == 1: + logger.info( + f" Per-token perplexity: {np.mean(ppls_hf[beam_idx])}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) + parser.add_argument( + '--tokenizer_dir', + default=None, + help='tokenizer path; defaults to hf_model_dir if left unspecified') + parser.add_argument( + '--tokenizer_type', + help= + 'Specify that argument when providing a .model file as the tokenizer_dir. 
' + 'It allows AutoTokenizer to instantiate the correct tokenizer type.') + parser.add_argument('--vocab_file') + parser.add_argument('--test_hf', action='store_true') + parser.add_argument('--test_trt_llm', action='store_true') + parser.add_argument( + '--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], + default='fp16') + parser.add_argument('--engine_dir', type=str, default='engine_outputs') + parser.add_argument('--use_py_session', + default=False, + action='store_true', + help="Whether or not to use Python runtime session") + parser.add_argument( + '--eval_task', + type=str, + default='summarize', + choices=['summarize', 'summarize_long', 'code_completion']) + parser.add_argument('--check_accuracy', action='store_true') + parser.add_argument('--tensorrt_llm_rouge1_threshold', + type=float, + default=15.0) + parser.add_argument('--eval_ppl', action='store_true') + parser.add_argument('--tensorrt_llm_ppl_threshold', + type=float, + default=15.0) + parser.add_argument('--target_load_engine_time', + type=float, + default=0) + parser.add_argument('--target_tps', + type=float, + default=0) + parser.add_argument('--dataset_path', type=str, default='') + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--batch_size', type=int, default=1) + parser.add_argument('--max_ite', type=int, default=20) + parser.add_argument('--output_len', type=int, default=100) + parser.add_argument('--max_input_length', type=int, default=923) + parser.add_argument( + '--max_attention_window_size', + type=int, + default=None, + help= + 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' + ) + parser.add_argument('--sink_token_length', + type=int, + default=None, + help='The sink token length.') + parser.add_argument('--num_beams', type=int, default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--debug_mode', + default=False, + action='store_true', + help="Whether or not to turn on the debug mode") + parser.add_argument('--no_add_special_tokens', + dest='add_special_tokens', + default=True, + action='store_false', + help="Whether or not to add special tokens") + parser.add_argument( + '--hf_device_map_auto', + action='store_true', + help="Use device map 'auto' to load a pretrained HF model. This may " + "help to test a large model that cannot fit into a singlue GPU.") + parser.add_argument( + '--output_dir', + type=str, + default=None, + help="Directory where to save output sentences. 'trtllm.out' for " + "TensorRT-LLM outputs, and 'hf.out' for HF outputs. If None, do not " + "save outputs.") + parser.add_argument( + '--medusa_choices', + type=str, + default=None, + help="Medusa choice to use, if not none, will use Medusa decoding." + " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." 
+ ) + parser.add_argument('--stability_test', + default=False, + action='store_true', + help="Whether or not to run stability test for tensorrt_llm.") + parser.add_argument('--stability_test_hours', type=float, default=24.0) + args = parser.parse_args() + print(args) + main(args) diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/utils.py b/models/nlp/large_language_model/llama2-70b/trtllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..44042d9e2dcb44dd6cd917ab16a00010e4005202 --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/utils.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Optional + +from transformers import AutoTokenizer, T5Tokenizer + +import tensorrt_llm + +DEFAULT_HF_MODEL_DIRS = { + 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', + 'bloom': 'bigscience/bloom-560m', + 'chatglm_6b': 'THUDM/chatglm-6b', + 'chatglm2_6b': 'THUDM/chatglm2-6b', + 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k', + 'chatglm3_6b': 'THUDM/chatglm3-6b', + 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base', + 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k', + 'falcon': 'tiiuae/falcon-rw-1b', + 'glm_10b': 'THUDM/glm-10b', + 'gpt': 'gpt2-medium', + 'gptj': 'EleutherAI/gpt-j-6b', + 'gptneox': 'EleutherAI/gpt-neox-20b', + 'internlm': 'internlm/internlm-chat-7b', + 'llama': 'meta-llama/Llama-2-7b-hf', + 'mpt': 'mosaicml/mpt-7b', + 'phi': 'microsoft/phi-2', + 'opt': 'facebook/opt-350m', + 'qwen': 'Qwen/Qwen-7B', +} + +DEFAULT_PROMPT_TEMPLATES = { + 'internlm': + "<|User|>:{input_text}\n<|Bot|>:", + 'qwen': + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", +} + + +def read_model_name(engine_dir: str): + engine_version = tensorrt_llm.runtime.engine.get_engine_version(engine_dir) + + with open(Path(engine_dir) / "config.json", 'r') as f: + config = json.load(f) + + if engine_version is None: + return config['builder_config']['name'] + + return config['pretrained_config']['architecture'] + + +def throttle_generator(generator, stream_interval): + for i, out in enumerate(generator): + if not i % stream_interval: + yield out + + if i % stream_interval: + yield out + + +def load_tokenizer(tokenizer_dir: Optional[str] = None, + vocab_file: Optional[str] = None, + model_name: str = 'gpt', + tokenizer_type: Optional[str] = None): + if vocab_file is None: + use_fast = True + if tokenizer_type is not None and tokenizer_type == "llama": + use_fast = False + # Should set both padding_side and truncation_side to be 'left' + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + legacy=False, + padding_side='left', + truncation_side='left', + trust_remote_code=True, + tokenizer_type=tokenizer_type, + use_fast=use_fast) + else: + # For gpt-next, directly load from tokenizer.model + 
+        assert model_name == 'gpt'
+        tokenizer = T5Tokenizer(vocab_file=vocab_file,
+                                padding_side='left',
+                                truncation_side='left')
+
+    if model_name == 'qwen':
+        with open(Path(tokenizer_dir) / "generation_config.json") as f:
+            gen_config = json.load(f)
+        chat_format = gen_config['chat_format']
+        if chat_format == 'raw':
+            pad_id = gen_config['pad_token_id']
+            end_id = gen_config['eos_token_id']
+        elif chat_format == 'chatml':
+            pad_id = tokenizer.im_end_id
+            end_id = tokenizer.im_end_id
+        else:
+            raise Exception(f"unknown chat format: {chat_format}")
+    elif model_name == 'glm_10b':
+        pad_id = tokenizer.pad_token_id
+        end_id = tokenizer.eop_token_id
+    else:
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        pad_id = tokenizer.pad_token_id
+        end_id = tokenizer.eos_token_id
+
+    return tokenizer, pad_id, end_id
diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/README.md b/models/nlp/large_language_model/llama2-7b/trtllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa8b239b495c61e2b7562368e433a394e45cd207
--- /dev/null
+++ b/models/nlp/large_language_model/llama2-7b/trtllm/README.md
@@ -0,0 +1,42 @@
+# LlaMa2 7B
+
+## Description
+We develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.
+
+## Setup
+
+### Install
+```bash
+yum install mesa-libGL
+
+bash scripts/set_environment.sh .
+# Please contact the staff to obtain the relevant installation packages.
+pip3 install Path/To/ixrt-xxx.whl
+pip3 install Path/To/tensorrt_llm-xxx.whl
+pip3 install Path/To/ixformer-xxx.whl
+```
+
+### Download
+- Model: https://huggingface.co/meta-llama/Llama-2-7b
+
+- Dataset: https://huggingface.co/datasets/cnn_dailymail
+
+```bash
+# Download the model from the website and make sure the model's path is "data/llama2-7b-chat"
+# Download the dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail"
+mkdir data
+
+# Please download rouge.py to this path if your server can't access huggingface.co.
+mkdir -p rouge/
+wget --no-check-certificate https://raw.githubusercontent.com/huggingface/evaluate/main/metrics/rouge/rouge.py -P rouge
+```
+
+## Inference
+### FP16
+
+```bash
+# Build engine
+bash scripts/test_trtllm_llama2_7b_gpu1_build.sh
+# Inference
+bash scripts/test_trtllm_llama2_7b_gpu1.sh
+```
\ No newline at end of file
diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/build.py b/models/nlp/large_language_model/llama2-7b/trtllm/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ff0c9eaa0cedfd382783a5cfcca9175bf38acad
--- /dev/null
+++ b/models/nlp/large_language_model/llama2-7b/trtllm/build.py
@@ -0,0 +1,1163 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import math +import os +import sys +import time +from pathlib import Path + +# isort: off +import torch +import torch.multiprocessing as mp +import tensorrt as trt +# isort: on +from transformers import LlamaConfig, LlamaForCausalLM + +try: + from transformers import MixtralForCausalLM +except ImportError: + MixtralForCausalLM = None + +try: + from transformers import LlavaConfig, LlavaForConditionalGeneration +except ImportError: + pass + +import tensorrt_llm +from tensorrt_llm import profiler +from tensorrt_llm._common import check_max_num_tokens +from tensorrt_llm._utils import str_dtype_to_trt +from tensorrt_llm.builder import Builder +from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.layers.attention import PositionEmbeddingType +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import quantize_model +from tensorrt_llm.network import net_guard +from tensorrt_llm.plugin.plugin import ContextFMHAType +from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.runtime.lora_manager import LoraConfig + +from tensorrt_llm.models.llama.weight import ( # isort:skip + get_scaling_factors, load_from_awq_llama, load_from_binary, + load_from_gptq_llama, load_from_hf_checkpoint, load_from_hf_llama, + load_from_meta_llama, parse_bin_config) + +MODEL_NAME = "llama" + +# 2 routines: get_engine_name, serialize_engine +# are direct copy from gpt example, TODO: put in utils? 
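For reference, the per-rank engine files produced by this build script are located purely by name. A minimal sketch of the naming scheme (it mirrors the format strings in get_engine_name below; the model name, dtype and parallelism values here are illustrative assumptions, not values taken from a specific config):

```python
# Sketch: reproduce the engine-file naming convention used by get_engine_name() below.
# The tp/pp/dtype values are illustrative only.

def engine_name(model: str, dtype: str, tp_size: int, pp_size: int, rank: int) -> str:
    if pp_size == 1:
        return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank)
    return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, pp_size, rank)

if __name__ == '__main__':
    world_size, tp_size, pp_size = 2, 2, 1
    for rank in range(world_size):
        # e.g. llama_float16_tp2_rank0.engine, llama_float16_tp2_rank1.engine
        print(engine_name('llama', 'float16', tp_size, pp_size, rank))
```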
+ +import onnx +from onnx import TensorProto, helper + + +def trt_dtype_to_onnx(dtype): + if dtype == trt.float16: + return TensorProto.DataType.FLOAT16 + if dtype == trt.bfloat16: + return TensorProto.DataType.BFLOAT16 + elif dtype == trt.float32: + return TensorProto.DataType.FLOAT + elif dtype == trt.int32: + return TensorProto.DataType.INT32 + elif dtype == trt.int64: + return TensorProto.DataType.INT64 + elif dtype == trt.bool: + return TensorProto.DataType.BOOL + else: + raise TypeError("%s is not supported" % dtype) + + +def to_onnx(network, path): + inputs = [] + for i in range(network.num_inputs): + network_input = network.get_input(i) + inputs.append( + helper.make_tensor_value_info( + network_input.name, trt_dtype_to_onnx(network_input.dtype), + list(network_input.shape))) + + outputs = [] + for i in range(network.num_outputs): + network_output = network.get_output(i) + outputs.append( + helper.make_tensor_value_info( + network_output.name, trt_dtype_to_onnx(network_output.dtype), + list(network_output.shape))) + + nodes = [] + for i in range(network.num_layers): + layer = network.get_layer(i) + layer_inputs = [] + for j in range(layer.num_inputs): + ipt = layer.get_input(j) + if ipt is not None: + layer_inputs.append(layer.get_input(j).name) + layer_outputs = [ + layer.get_output(j).name for j in range(layer.num_outputs) + ] + nodes.append( + helper.make_node(str(layer.type), + name=layer.name, + inputs=layer_inputs, + outputs=layer_outputs, + domain="com.nvidia")) + + onnx_model = helper.make_model(helper.make_graph(nodes, + 'attention', + inputs, + outputs, + initializer=None), + producer_name='NVIDIA') + onnx.save(onnx_model, path) + + +def get_engine_name(model, dtype, tp_size, pp_size, rank): + if pp_size == 1: + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, + pp_size, rank) + + +def serialize_engine(engine, path): + logger.info(f'Serializing engine to {path}...') + tik = time.time() + with open(path, 'wb') as f: + f.write(engine) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine serialized. Total time: {t}') + + +def parse_arguments(cmd_args=None): + parser = argparse.ArgumentParser() + parser.add_argument('--world_size', type=int, default=1) + parser.add_argument('--tp_size', type=int, default=1) + parser.add_argument('--pp_size', type=int, default=1) + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--bin_model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + parser.add_argument('--quant_ckpt_path', type=str, default=None) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--timing_cache', + type=str, + default='model.cache', + help= + 'The path of to read timing cache from, will be ignored if the file does not exist' + ) + parser.add_argument( + '--profiling_verbosity', + type=str, + default='layer_names_only', + choices=['layer_names_only', 'detailed', 'none'], + help= + 'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.' 
+ ) + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--vocab_size', type=int, default=32000) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--multiple_of', type=int, default=256) + parser.add_argument('--ffn_dim_multiplier', type=float, default=1.0) + parser.add_argument('--inter_size', type=int, default=None) + parser.add_argument('--hidden_act', type=str, default='silu') + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + parser.add_argument('--max_batch_size', type=int, default=8) + parser.add_argument('--max_input_len', type=int, default=2048) + parser.add_argument('--max_output_len', type=int, default=512) + parser.add_argument('--max_beam_width', type=int, default=1) + parser.add_argument('--rotary_base', type=float, default=10000.0) + parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) + parser.add_argument('--use_gpt_attention_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_gemm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_rmsnorm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--use_lookup_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_gather_last_token_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--use_activation_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--use_elementwise_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument("--use_cast_plugin", action="store_true") + + parser.add_argument('--parallel_build', default=False, action='store_true') + parser.add_argument('--enable_context_fmha', + default=False, + action='store_true') + parser.add_argument('--enable_context_fmha_fp32_acc', + default=False, + action='store_true') + parser.add_argument( + '--use_paged_context_fmha', + action='store_true', + help= + 'Activates paged context FMHA. This mode of the context FMHA is required for chunked context, speculative decoding and reuse of KV cache blocks. Context FMHA performance is worse when this mode is on.' + ) + parser.add_argument( + '--multi_block_mode', + default=False, + action='store_true', + help= + 'Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ + It is beneficial when batch x num_heads cannot fully utilize GPU.' + ) + parser.add_argument( + '--disable_xqa', + default=False, + action='store_true', + help= + 'Disable XQA optimization for the generation MHA. See more details in docs/gpt_attention.' 
+ ) + parser.add_argument('--visualize', default=False, action='store_true') + parser.add_argument('--load_by_shard', + action='store_true', + help='Load a pretrained model shard-by-shard.') + parser.add_argument('--enable_debug_output', + default=False, + action='store_true') + parser.add_argument('--gpus_per_node', type=int, default=8) + parser.add_argument('--builder_opt', type=int, default=None) + parser.add_argument( + '--output_dir', + type=str, + default='engine_outputs', + help= + 'The path to save the serialized engine files, timing cache file and model configs' + ) + parser.add_argument('--remove_input_padding', + default=False, + action='store_true') + parser.add_argument( + '--use_fused_mlp', + default=False, + action='store_true', + help= + 'Enable horizontal fusion in GatedMLP, reduces layer input traffic and potentially improves performance. ' + 'For FP8 PTQ, the downside is slight reduction of accuracy because one of the quantization scaling factors are discarded ' + '(0.45734 vs 0.45755 for LLaMA-v2 7B using ammo/examples/hf/instruct_eval/mmlu.py).' + ) + parser.add_argument('--enable_pos_shift', + default=False, + action='store_true', + help='Enable position shift for streamingllm method') + parser.add_argument( + '--dense_context_fmha', + default=False, + action='store_true', + help= + 'Enable dense fmha in context phase, otherwise sliding window attention.' + 'If dense_context_fmha=False, the sliding window size is the max attention window size.' + ) + # Arguments related to the quantization of the model. + parser.add_argument( + '--use_smooth_quant', + default=False, + action="store_true", + help= + 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' + 'See --per_channel and --per_token for finer-grained quantization options.' + ) + parser.add_argument( + '--per_channel', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. ' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_group', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale weights in the int4 range. ' + 'per_group chooses at run time, and for each group, a custom scaling factor. ' + 'The flag is built for GPTQ/AWQ quantization.') + parser.add_argument('--group_size', + type=int, + default=128, + help='Group size used in GPTQ/AWQ quantization.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=1, # Meta does TP on hidden dim + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). 
' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--enable_fp8', + default=False, + action='store_true', + help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') + parser.add_argument( + '--fp8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. fp8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--quantized_fp8_model_path', + type=str, + default=None, + help='Path of a quantized model checkpoint in .npz format') + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_awq', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--quantize_lm_head', + default=False, + action="store_true", + help='Quantize lm_head weights as well when using int4_awq.') + parser.add_argument( + '--use_inflight_batching', + action="store_true", + default=False, + help="Activates inflight batching mode of gptAttentionPlugin.") + parser.add_argument( + '--paged_kv_cache', + action="store_true", + default=False, + help= + 'By default we use contiguous KV cache. By setting this flag you enable paged KV cache' + ) + parser.add_argument('--tokens_per_block', + type=int, + default=128, + help='Number of tokens per block in paged KV cache') + parser.add_argument( + '--max_num_tokens', + type=int, + default=None, + help= + 'Define the max number of tokens supported by the engine, note that it takes no effect if --remove_input_padding is not set' + ) + parser.add_argument( + '--strongly_typed', + default=False, + action="store_true", + help= + 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' 
+ ) + parser.add_argument( + '--use_custom_all_reduce', + action='store_true', + help= + 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') + parser.add_argument( + '--max_prompt_embedding_table_size', + type=int, + default=0, + help='Setting to a value > 0 enables support for prompt tuning.') + parser.add_argument( + '--gather_all_token_logits', + action='store_true', + default=False, + help='Enable both gather_context_logits and gather_generation_logits') + parser.add_argument('--gather_context_logits', + action='store_true', + default=False, + help='Gather context logits') + parser.add_argument('--gather_generation_logits', + action='store_true', + default=False, + help='Gather generation logits') + parser.add_argument( + '--use_lora_plugin', + nargs='?', + const=None, + default=False, + choices=['float16', 'float32', 'bfloat16'], + help="Activates the lora plugin which enables embedding sharing.") + parser.add_argument( + '--lora_target_modules', + nargs='+', + default=None, + choices=[ + "attn_qkv", + "attn_q", + "attn_k", + "attn_v", + "attn_dense", + "mlp_h_to_4h", + "mlp_gate", + "mlp_4h_to_h", + ], + help= + "Add lora in which modules. Only be activated when use_lora_plugin is enabled." + ) + parser.add_argument('--hf_lora_dir', type=str, default=None) + parser.add_argument( + '--max_lora_rank', + type=int, + default=64, + help='maximum lora rank for different lora modules. ' + 'It is used to compute the workspace size of lora plugin.') + parser.add_argument( + '--moe_num_experts', + default=0, + type=int, + help='Specify the number of experts to use for MOE layers') + parser.add_argument( + '--moe_top_k', + default=0, + type=int, + help= + 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' + ) + parser.add_argument( + '--moe_tp_mode', + default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, + type=int, + help= + 'Controls how to distribute experts in TP. Check layers/moe.py for accepted values', + ) + parser.add_argument( + '--moe_renorm_mode', + default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + type=int, + help= + 'Controls renormalization after gate logits. Check layers/moe.py for accepted values', + ) + parser.add_argument("--total_build_time_target", type=float, default=0) + + args = parser.parse_args(cmd_args) + logger.set_level(args.log_level) + + assert args.total_build_time_target >= 0, "total_build_time_target must bigger than 0" + + assert not ( + args.use_smooth_quant and args.use_weight_only + ), "You cannot enable both SmoothQuant and INT8 weight-only together." + + if not args.remove_input_padding: + if args.use_gpt_attention_plugin: + logger.warning( + f"It is recommended to specify --remove_input_padding when using GPT attention plugin" + ) + + if args.use_inflight_batching: + if not args.use_gpt_attention_plugin: + args.use_gpt_attention_plugin = 'float16' + logger.info( + f"Using GPT attention plugin for inflight batching mode. 
Setting to default '{args.use_gpt_attention_plugin}'" + ) + if not args.remove_input_padding: + args.remove_input_padding = True + logger.info( + "Using remove input padding for inflight batching mode.") + if not args.paged_kv_cache: + args.paged_kv_cache = True + logger.info("Using paged KV cache for inflight batching mode.") + + if args.use_smooth_quant: + args.quant_mode = QuantMode.use_smooth_quant(args.per_token, + args.per_channel) + elif args.use_weight_only: + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=args.per_group, + use_int4_weights="int4" in args.weight_only_precision) + else: + args.quant_mode = QuantMode(0) + + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + elif args.fp8_kv_cache: + args.quant_mode = args.quant_mode.set_fp8_kv_cache() + if args.enable_fp8: + args.quant_mode = args.quant_mode.set_fp8_qdq() + + if args.rotary_scaling is not None: + assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." + rotary_scaling = { + "type": args.rotary_scaling[0], + "factor": float(args.rotary_scaling[1]) + } + assert rotary_scaling["type"] in ["linear", "dynamic"] + assert rotary_scaling["factor"] > 1.0 + args.rotary_scaling = rotary_scaling + + if args.model_dir is not None: + hf_config = LlamaConfig.from_pretrained(args.model_dir) + if hf_config.model_type == "llava": + # LLaVA = Vision model + Llama LLM + # We load a llava config and use its' text config as llama config + hf_config = LlavaConfig.from_pretrained(args.model_dir).text_config + hf_config.model_type = "llava" # Replace llama with llava + + args.inter_size = hf_config.intermediate_size # override the inter_size for LLaMA + args.n_embd = hf_config.hidden_size + args.n_head = hf_config.num_attention_heads + if hasattr(hf_config, "num_key_value_heads"): + args.n_kv_head = hf_config.num_key_value_heads + + # hf_config.num_hidden_layers = 1 # only for debug + args.n_layer = hf_config.num_hidden_layers + args.n_positions = hf_config.max_position_embeddings + args.vocab_size = hf_config.vocab_size if hf_config.vocab_size is not None else args.vocab_size + args.hidden_act = hf_config.hidden_act + args.rms_norm_eps = hf_config.rms_norm_eps + # These attributes only exists with Mixtral, for the moment + args.moe_num_experts = getattr(hf_config, "num_local_experts", + args.moe_num_experts) + args.moe_top_k = getattr(hf_config, "num_experts_per_tok", + args.moe_top_k) + args.rotary_base = getattr(hf_config, "rope_theta", args.rotary_base) + args.model_type = hf_config.model_type + if hf_config.model_type == "mixtral": + # HF LLaMA-type models are implicitly using gated activation. 
+ # With our MoE implementation, we must make it explicit + args.hidden_act = "swiglu" + + elif args.meta_ckpt_dir is not None: + with open(Path(args.meta_ckpt_dir, "params.json")) as fp: + meta_config: dict = json.load(fp) + args.n_embd = meta_config["dim"] + args.n_head = meta_config["n_heads"] + args.n_layer = meta_config["n_layers"] + args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) + if "hidden_dim" in meta_config: + args.inter_size = meta_config["hidden_dim"] + else: + args.multiple_of = meta_config.get("multiple_of", 1) + n_embd = int(4 * args.n_embd * 2 / 3) + args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) + // args.multiple_of) + args.rms_norm_eps = meta_config["norm_eps"] + args.moe_num_experts = meta_config.get("moe", {}).get("num_experts", 0) + args.moe_top_k = meta_config.get("moe", {}).get("num_experts_per_tok", + 0) + elif args.bin_model_dir is not None: + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head = parse_bin_config( + Path(args.bin_model_dir) / "config.ini") + args.inter_size = inter_size # override the inter_size for LLaMA + args.n_kv_head = n_kv_head + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size if args.vocab_size is None else args.vocab_size + args.hidden_act = hidden_act + args.rms_norm_eps = 1e-06 + logger.warning("Set rms_norm_eps to 1e-06 directly.") + if args.n_kv_head is None: + args.n_kv_head = args.n_head + elif args.n_kv_head != args.n_head: + assert (args.n_head % args.n_kv_head) == 0, \ + "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." + assert (args.n_kv_head % args.tp_size) == 0 or (args.tp_size % args.n_kv_head) == 0, \ + "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ + "the tensor parallelism size to be divisible by the number of K/V heads." 
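The assertions above encode when an MQA/GQA head layout can be split across tensor-parallel ranks. A minimal sketch of the same rule evaluated on a few example configurations (the head counts are illustrative, e.g. a LLaMA-2-70B-style 64/8 split; they are not read from this repo's configs):

```python
# Sketch: the MQA/GQA validity rule from the asserts above, applied to example configs.

def gqa_config_ok(n_head: int, n_kv_head: int, tp_size: int) -> bool:
    if n_head % n_kv_head != 0:
        return False  # query heads must be an integer multiple of KV heads
    # KV heads must divide the TP size or vice versa
    return n_kv_head % tp_size == 0 or tp_size % n_kv_head == 0

if __name__ == '__main__':
    # (n_head, n_kv_head, tp_size): 7B-style MHA, 70B-style GQA splits, and an invalid split
    for cfg in [(32, 32, 1), (64, 8, 8), (64, 8, 4), (64, 8, 3)]:
        print(cfg, gqa_config_ok(*cfg))
```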
+ + hf_modules_to_trtllm_modules = { + "q_proj": "attn_q", + "k_proj": "attn_k", + "v_proj": "attn_v", + "o_proj": "attn_dense", + "gate_proj": "mlp_h_to_4h", + "down_proj": "mlp_4h_to_h", + "up_proj": "mlp_gate" + } # lora modules on llama + + trtllm_modules_to_hf_modules = { + "attn_q": "q_proj", + "attn_k": "k_proj", + "attn_v": "v_proj", + "attn_dense": "o_proj", + "mlp_h_to_4h": "gate_proj", + "mlp_4h_to_h": "down_proj", + "mlp_gate": "up_proj", + } + + lora_config = LoraConfig.from_hf(args.hf_lora_dir, + hf_modules_to_trtllm_modules, + trtllm_modules_to_hf_modules) + + if lora_config.is_valid: + if args.lora_target_modules is None: + args.lora_target_modules = lora_config.lora_target_modules + # the lora checkpoint might finetune the embedding + if lora_config.vocab_size != 0: + args.vocab_size = lora_config.vocab_size + + args.lora_config = lora_config + + if args.weight_only_precision == 'int4_awq': + inter_alignment = args.tp_size * 128 + if args.inter_size % inter_alignment != 0: + args.inter_size = int((args.inter_size + inter_alignment - 1) / + inter_alignment) * inter_alignment + logger.info("To use awq we pad intermediate_size to {}.".format( + args.inter_size)) + + if args.quantize_lm_head: + vocab_alignment = args.tp_size * 64 + if args.vocab_size % vocab_alignment != 0: + args.vocab_size = int((args.vocab_size + vocab_alignment - 1) / + vocab_alignment) * vocab_alignment + logger.info("To use awq we pad vocab_size to {}.".format( + args.vocab_size)) + + assert args.pp_size * args.tp_size == args.world_size + + args.max_num_tokens = check_max_num_tokens( + max_num_tokens=args.max_num_tokens, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + remove_input_padding=args.remove_input_padding) + + assert (math.log2(args.tokens_per_block).is_integer() + ), "tokens_per_block must be power of 2" + if args.enable_context_fmha or args.enable_context_fmha_fp32_acc: + assert (args.tokens_per_block >= + 128), "Context fMHA requires >= 128 tokens per block" + + if args.inter_size is None: + # this should not be need when loading a real model + # but it is helpful when creating a dummy model without loading any real weights + n_embd = int(4 * args.n_embd * 2 / 3) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // + args.multiple_of) + logger.info(f"Setting inter_size to {args.inter_size}.") + + if args.enable_pos_shift: + assert args.use_gpt_attention_plugin, "Position shift is only support in the gpt attention plugin." 
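When --inter_size is left unset, the fallback a few lines above derives the LLaMA FFN width from n_embd, multiple_of and ffn_dim_multiplier. A small worked example using the script's default values (n_embd=4096, multiple_of=256, ffn_dim_multiplier=1.0), which rounds up to 11008, the intermediate size commonly seen in 7B-class LLaMA configs:

```python
# Sketch: the inter_size fallback shown above, evaluated with the script's 7B-style defaults.

def fallback_inter_size(n_embd: int, multiple_of: int, ffn_dim_multiplier: float) -> int:
    hidden = int(4 * n_embd * 2 / 3)                       # 4096 -> 10922
    scaled = int(hidden * ffn_dim_multiplier)              # 10922 with multiplier 1.0
    return multiple_of * ((scaled + multiple_of - 1) // multiple_of)  # round up to a multiple of 256

if __name__ == '__main__':
    print(fallback_inter_size(4096, 256, 1.0))  # 11008
```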
+ assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc + + if args.moe_num_experts and args.moe_top_k == 0: + args.moe_top_k = 1 + args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, + args.moe_tp_mode, + args.moe_renorm_mode).validate() + + if args.gather_all_token_logits: + args.gather_context_logits = True + args.gather_generation_logits = True + + return args + + +def get_model_object(args, mapping, trt_dtype=None): + if trt_dtype is None: + trt_dtype = str_dtype_to_trt(args.dtype) + # Initialize Module + logger.debug("[Python]llama exampels, Initialize tensorrt_llm.models.LLaMAForCausalLM....") + tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM( + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + max_position_embeddings=args.n_positions, + dtype=trt_dtype, + mlp_hidden_size=args.inter_size, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + mapping=mapping, + rotary_base=args.rotary_base, + rotary_scaling=args.rotary_scaling, + use_parallel_embedding=args.use_parallel_embedding, + embedding_sharding_dim=args.embedding_sharding_dim, + quant_mode=args.quant_mode, + rms_norm_eps=args.rms_norm_eps, + use_fused_mlp=args.use_fused_mlp, + use_prompt_tuning=args.max_prompt_embedding_table_size > 0, + enable_pos_shift=args.enable_pos_shift, + dense_context_fmha=args.dense_context_fmha, + moe_config=args.moe_config, + max_lora_rank=args.max_lora_rank) + quantize_kwargs = {} + if args.use_smooth_quant or args.use_weight_only: + if args.weight_only_precision == 'int4_awq': + exclude_modules = ['lm_head'] if not args.quantize_lm_head else [] + quantize_kwargs = { + "group_size": args.group_size, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": exclude_modules, + } + elif args.weight_only_precision == 'int4_gptq': + quantize_kwargs = { + "group_size": args.group_size, + "zero": True, + "pre_quant_scale": False, + } + elif args.enable_fp8 or args.fp8_kv_cache: + logger.info(f'Loading scaling factors from ' + f'{args.quantized_fp8_model_path}') + quant_scales = get_scaling_factors(args.quantized_fp8_model_path, + num_layers=args.n_layer, + quant_mode=args.quant_mode) + quantize_kwargs = {"quant_scales": quant_scales} + + if args.use_weight_only and args.moe_config.has_moe(): + if 'exclude_modules' in quantize_kwargs: + quantize_kwargs['exclude_modules'].append('router') + else: + quantize_kwargs['exclude_modules'] = ['lm_head', 'router'] + + tensorrt_llm_llama = quantize_model(tensorrt_llm_llama, args.quant_mode, + **quantize_kwargs) + if args.per_group: + if args.weight_only_precision == 'int4_awq': + load_from_awq_llama(tensorrt_llm_llama=tensorrt_llm_llama, + quant_ckpt_path=args.quant_ckpt_path, + quantize_lm_head=args.quantize_lm_head, + mapping=mapping, + dtype=args.dtype, + bin_model_dir=args.bin_model_dir) + else: + load_from_gptq_llama(tensorrt_llm_llama=tensorrt_llm_llama, + quant_ckpt_path=args.quant_ckpt_path, + mapping=mapping, + dtype=args.dtype, + bin_model_dir=args.bin_model_dir) + elif args.meta_ckpt_dir is not None: + load_from_meta_llama(tensorrt_llm_llama, args.meta_ckpt_dir, mapping, + args.dtype) + elif args.model_dir is not None: + logger.info(f'Loading HF LLaMA ... 
from {args.model_dir}') + tik = time.time() + if not args.load_by_shard: + if args.model_type == "llava": + hf_llava = LlavaForConditionalGeneration.from_pretrained( + args.model_dir, torch_dtype="auto") + hf_llama = hf_llava.language_model + else: + hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM + hf_llama = hf_model.from_pretrained( + args.model_dir, + device_map={ + "model": "cpu", + "lm_head": "cpu", + "embed_tokens": "cpu", + "layers": "cpu", + "norm": "cpu", + }, # Load to CPU memory + torch_dtype='auto', + ) + use_gemm_woq_plugin = not args.disable_weight_only_quant_plugin + # hf_llama.config.num_hidden_layers = 1 # only for debug + load_from_hf_llama(tensorrt_llm_llama, + hf_llama, + mapping=mapping, + dtype=args.dtype, + use_gemm_woq_plugin=use_gemm_woq_plugin, + lora_config=args.lora_config) + del hf_llama + else: + load_from_hf_checkpoint(tensorrt_llm_llama, + args.model_dir, + mapping, + dtype=args.dtype, + lora_config=args.lora_config) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'HF LLaMA loaded. Total time: {t}') + + elif args.bin_model_dir is not None: + load_from_binary(tensorrt_llm_llama, + args.bin_model_dir, + mapping, + fp16=(args.dtype == 'float16'), + multi_query_mode=(args.n_kv_head != args.n_head)) + + return tensorrt_llm_llama + + +def update_plugin_configs(args, network): + if args.use_gpt_attention_plugin: + network.plugin_config.set_gpt_attention_plugin( + dtype=args.use_gpt_attention_plugin) + if args.use_gemm_plugin: + if not args.enable_fp8: + network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + else: + logger.info( + "Gemm plugin does not support FP8. Disabled Gemm plugin.") + if args.use_rmsnorm_plugin: + network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) + if args.use_lora_plugin: + network.plugin_config.set_lora_plugin(dtype=args.use_lora_plugin) + if args.use_lookup_plugin: + network.plugin_config.set_lookup_plugin(dtype=args.use_lookup_plugin) + if args.use_gather_last_token_plugin: + network.plugin_config.set_gather_last_token_plugin(dtype=args.use_gather_last_token_plugin) + if args.use_activation_plugin: + network.plugin_config.set_activation_plugin(dtype=args.use_activation_plugin) + if args.use_elementwise_plugin: + network.plugin_config.set_elementwise_plugin(dtype=args.use_elementwise_plugin) + if args.use_cast_plugin: + network.plugin_config.set_cast_plugin() + + # Quantization plugins. 
+ if args.use_smooth_quant: + network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) + network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) + network.plugin_config.set_quantize_tensor_plugin() + network.plugin_config.set_quantize_per_token_plugin() + assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) + if args.enable_context_fmha: + network.plugin_config.set_context_fmha(ContextFMHAType.enabled) + if args.enable_context_fmha_fp32_acc: + network.plugin_config.set_context_fmha( + ContextFMHAType.enabled_with_fp32_acc) + if args.multi_block_mode: + network.plugin_config.enable_mmha_multi_block_mode() + if not args.disable_xqa: + network.plugin_config.enable_xqa_optimization() + + if args.use_weight_only and not args.disable_weight_only_quant_plugin: + if args.per_group: + network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( + dtype=args.dtype) + else: + network.plugin_config.set_weight_only_quant_matmul_plugin( + dtype=args.dtype) + if args.world_size > 1: + network.plugin_config.set_nccl_plugin(args.dtype, + args.use_custom_all_reduce) + if args.remove_input_padding: + network.plugin_config.enable_remove_input_padding() + if args.paged_kv_cache: + network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) + return + + +def build_rank_engine(builder: Builder, + builder_config: tensorrt_llm.builder.BuilderConfig, + engine_name, rank, args): + ''' + @brief: Build the engine on the given rank. + @param rank: The rank to build the engine. + @param args: The cmd line arguments. + @return: The built engine. + ''' + dtype = str_dtype_to_trt(args.dtype) + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + + assert args.n_layer % args.pp_size == 0, \ + f"num_layers {args.n_layer} must be a multiple of pipeline parallelism size {args.pp_size}" + + # FIXME (Not Support libnvidia-ml.so) + # profiler.print_memory_usage(f'Rank {rank} Engine build starts') + # Initialize Module + tensorrt_llm_llama = get_model_object(args, + mapping=mapping, + trt_dtype=dtype) + + # FIXME (Not Support libnvidia-ml.so) + # profiler.print_memory_usage(f'Rank {rank} model weight loaded.') + + # Module -> Network + logger.debug("[Python]llama exampels, convert module to network....") + network = builder.create_network() + network.trt_network.name = engine_name + update_plugin_configs(args, network) + + if args.use_paged_context_fmha: + assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc, "context fmha must be enabled" + network.plugin_config.set_paged_context_fmha() + + logger.debug(f"[Python]llama exampels, network.plugin_config: \n{network.plugin_config}") + with net_guard(network): + # Prepare + network.set_named_parameters(tensorrt_llm_llama.named_parameters()) + + # Forward + inputs = tensorrt_llm_llama.prepare_inputs( + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_seq_len=args.max_input_len + args.max_output_len, + use_cache=True, + max_beam_width=args.max_beam_width, + max_num_tokens=args.max_num_tokens, + prompt_embedding_table_size=args.max_prompt_embedding_table_size, + gather_context_logits=args.gather_context_logits, + gather_generation_logits=args.gather_generation_logits, + lora_target_modules=args.lora_target_modules) + logger.info(f"[Python]llama exampels, forward....\n") + tensorrt_llm_llama(*inputs) + logger.info(f"[Python]llama exampels, forward finished\n") + if args.enable_debug_output: + # mark intermediate nodes' 
outputs + for k, v in tensorrt_llm_llama.named_network_outputs(): + logger.debug(f"enable_debug_output, debug tensor name: {k}") + v = v.trt_tensor + v.name = k + network.trt_network.mark_output(v) + v.dtype = dtype + if args.visualize: + model_path = os.path.join(args.output_dir, 'test.onnx') + to_onnx(network.trt_network, model_path) + + logger.debug("[Python]llama examples, tensorrt_llm.graph_rewriting.optimize....") + tensorrt_llm.graph_rewriting.optimize(network) + + engine = None + + # Network -> Engine + logger.debug("[Python]llama examples, builder.build_engine....") + engine = builder.build_engine(network, builder_config) + if rank == 0: + config_path = os.path.join(args.output_dir, 'config.json') + builder.save_config(builder_config, config_path) + + return engine + + +def get_builder_config_namespace(args, cache): + # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT + # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) + # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). + int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() + and not args.quant_mode.has_per_group_scaling()) or ( + not args.paged_kv_cache + and args.quant_mode.has_int8_kv_cache()) + config = argparse.Namespace( + name=MODEL_NAME, + precision=args.dtype, + timing_cache=args.timing_cache if cache is None else cache, + profiling_verbosity=args.profiling_verbosity, + tensor_parallel=args.tp_size, + pipeline_parallel=args.pp_size, + parallel_build=args.parallel_build, + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + max_position_embeddings=args.n_positions, + max_batch_size=args.max_batch_size, + max_beam_width=args.max_beam_width, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, + int8=int8_trt_flag, + quant_mode=args.quant_mode, + strongly_typed=args.strongly_typed, + opt_level=args.builder_opt, + max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, + gather_context_logits=args.gather_context_logits, + gather_generation_logits=args.gather_generation_logits, + lora_target_modules=args.lora_target_modules, + mlp_hidden_size=args.inter_size, + hf_modules_to_trtllm_modules=args.lora_config. + hf_modules_to_trtllm_modules, + trtllm_modules_to_hf_modules=args.lora_config. + trtllm_modules_to_hf_modules, + ) + return config + + +def build(rank, args): + torch.cuda.set_device(rank % args.gpus_per_node) + logger.set_level(args.log_level) + os.makedirs(args.output_dir, exist_ok=True) + + # when doing serializing build, all ranks share one engine + builder = Builder() + cache = None + for cur_rank in range(args.world_size): + # skip other ranks if parallel_build is enabled + if args.parallel_build and cur_rank != rank: + continue + tik = time.time() + + # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT + # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) + # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). 
+ int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() + and not args.quant_mode.has_per_group_scaling()) or ( + not args.paged_kv_cache + and args.quant_mode.has_int8_kv_cache()) + builder_config = builder.create_builder_config( + **vars(get_builder_config_namespace(args, cache))) + engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, + args.pp_size, cur_rank) + logger.debug("[Python]llama example, build_rank_engine....") + engine = build_rank_engine(builder, builder_config, engine_name, + cur_rank, args) + assert engine is not None, f'Failed to build engine for rank {cur_rank}' + + local_num_kv_heads = (args.n_kv_head + args.world_size - + 1) // args.world_size + kv_dtype = str_dtype_to_trt(args.dtype) + if args.quant_mode.has_int8_kv_cache(): + kv_dtype = str_dtype_to_trt('int8') + elif args.quant_mode.has_fp8_kv_cache(): + kv_dtype = str_dtype_to_trt('fp8') + + # FIXME (Not Support libnvidia-ml.so) + # profiler.check_gpt_mem_usage( + # engine=engine, + # kv_dtype=kv_dtype, + # use_gpt_attention_plugin=args.use_gpt_attention_plugin, + # paged_kv_cache=args.paged_kv_cache, + # max_batch_size=args.max_batch_size, + # max_beam_width=args.max_beam_width, + # max_seq_len=args.max_input_len + args.max_output_len, + # local_num_kv_heads=local_num_kv_heads, + # head_size=args.n_embd / args.n_head, + # num_layers=args.n_layer) + + if cur_rank == 0: + # Use in-memory timing cache for multiple builder passes. + if not args.parallel_build: + cache = builder_config.trt_builder_config.get_timing_cache() + + serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine + # FIXME (Not Support libnvidia-ml.so) + # profiler.print_memory_usage(f'Rank {cur_rank} Engine serialized') + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info( + f'Rank {cur_rank} Engine build time: {t} - {tok - tik} (sec)') + + if rank == 0: + ok = builder.save_timing_cache( + builder_config, os.path.join(args.output_dir, "model.cache")) + assert ok, "Failed to save timing cache." + + +if __name__ == '__main__': + args = parse_arguments() + print(args) + tik = time.time() + if args.parallel_build and args.world_size > 1 and \ + torch.cuda.device_count() >= args.world_size: + logger.warning( + f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' + ) + mp.spawn(build, nprocs=args.world_size, args=(args, )) + else: + args.parallel_build = False + logger.info('Serially build TensorRT engines.') + build(0, args) + + tok = time.time() + build_engine_time = tok - tik + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Total time of building all {args.world_size} engines: {t}') + + if args.total_build_time_target != 0: + status = build_engine_time <= args.total_build_time_target + if status: + print("successful.") + else: + print(f"Build engine time check failed! Target: {args.total_build_time_target}, Actual: {build_engine_time}") + sys.exit(int(not status)) diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/run.py b/models/nlp/large_language_model/llama2-7b/trtllm/run.py new file mode 100644 index 0000000000000000000000000000000000000000..3899ec9d55a33bca6eeeac4840353345467b474d --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/run.py @@ -0,0 +1,539 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import csv +from pathlib import Path +import sys +import time + +import numpy as np +import torch +import tensorrt_llm +import tensorrt_llm.profiler +from tensorrt_llm.logger import logger +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner + +from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, + load_tokenizer, read_model_name, throttle_generator) + +if PYTHON_BINDINGS: + from tensorrt_llm.runtime import ModelRunnerCpp + + +def parse_arguments(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument( + '--max_attention_window_size', + type=int, + default=None, + help= + 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' + ) + parser.add_argument('--sink_token_length', + type=int, + default=None, + help='The sink token length.') + parser.add_argument('--log_level', type=str, default='error') + parser.add_argument('--engine_dir', type=str, default='engine_outputs') + parser.add_argument('--use_py_session', + default=False, + action='store_true', + help="Whether or not to use Python runtime session") + parser.add_argument( + '--input_text', + type=str, + nargs='+', + default=["Born in north-east France, Soyer trained as a"]) + parser.add_argument( + '--no_prompt_template', + dest='use_prompt_template', + default=True, + action='store_false', + help= + "Whether or not to use default prompt template to wrap the input text.") + parser.add_argument( + '--input_file', + type=str, + help= + 'CSV or Numpy file containing tokenized input. Alternative to text input.', + default=None) + parser.add_argument('--max_input_length', type=int, default=923) + parser.add_argument('--output_csv', + type=str, + help='CSV file where the tokenized output is stored.', + default=None) + parser.add_argument('--output_npy', + type=str, + help='Numpy file where the tokenized output is stored.', + default=None) + parser.add_argument( + '--output_logits_npy', + type=str, + help= + 'Numpy file where the generation logits are stored. Use only when num_beams==1', + default=None) + parser.add_argument('--tokenizer_dir', + help="HF tokenizer config path", + default='gpt2') + parser.add_argument( + '--tokenizer_type', + help= + 'Specify that argument when providing a .model file as the tokenizer_dir. 
' + 'It allows AutoTokenizer to instantiate the correct tokenizer type.') + parser.add_argument('--vocab_file', + help="Used for sentencepiece tokenizers") + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams >1", + default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--debug_mode', + default=False, + action='store_true', + help="Whether or not to turn on the debug mode") + parser.add_argument('--no_add_special_tokens', + dest='add_special_tokens', + default=True, + action='store_false', + help="Whether or not to add special tokens") + parser.add_argument('--streaming', default=False, action='store_true') + parser.add_argument('--streaming_interval', + type=int, + help="How often to return tokens when streaming.", + default=5) + parser.add_argument( + '--prompt_table_path', + type=str, + help="Path to .npy file, exported by nemo_prompt_convert.py") + parser.add_argument( + '--prompt_tasks', + help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") + parser.add_argument('--lora_dir', + type=str, + default=None, + nargs="+", + help="The directory of LoRA weights") + parser.add_argument( + '--lora_task_uids', + type=str, + default=None, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module") + parser.add_argument('--lora_ckpt_source', + type=str, + default="hf", + choices=["hf", "nemo"], + help="The source of lora checkpoint.") + parser.add_argument( + '--num_prepend_vtokens', + nargs="+", + type=int, + help="Number of (default) virtual tokens to prepend to each sentence." + " For example, '--num_prepend_vtokens=10' will prepend the tokens" + " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + parser.add_argument( + '--run_profiling', + default=False, + action='store_true', + help="Run several 10 iterations to profile the inference latencies.") + parser.add_argument( + '--medusa_choices', + type=str, + default=None, + help="Medusa choice to use, if not none, will use Medusa decoding." + " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." 
+ ) + parser.add_argument('--target_load_engine_time', + type=float, + default=0) + parser.add_argument('--target_qps', + type=float, + default=0) + + return parser.parse_args(args=args) + + +def parse_input(tokenizer, + input_text=None, + prompt_template=None, + input_file=None, + add_special_tokens=True, + max_input_length=923, + pad_id=None, + num_prepend_vtokens=[], + model_name=None): + if pad_id is None: + pad_id = tokenizer.pad_token_id + + batch_input_ids = [] + if input_file is None: + for curr_text in input_text: + if prompt_template is not None: + curr_text = prompt_template.format(input_text=curr_text) + input_ids = tokenizer.encode(curr_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + input_ids = np.array(line, dtype='int32') + batch_input_ids.append(input_ids[-max_input_length:]) + elif input_file.endswith('.npy'): + inputs = np.load(input_file) + for row in inputs: + input_ids = row[row != pad_id] + batch_input_ids.append(input_ids[-max_input_length:]) + elif input_file.endswith('.txt'): + with open(input_file, 'r', encoding='utf-8', + errors='replace') as txt_file: + input_text = txt_file.read() + input_ids = tokenizer.encode( + input_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + else: + print('Input file format not supported.') + raise SystemExit + + if num_prepend_vtokens: + assert len(num_prepend_vtokens) == len(batch_input_ids) + base_vocab_size = tokenizer.vocab_size - len( + tokenizer.special_tokens_map.get('additional_special_tokens', [])) + for i, length in enumerate(num_prepend_vtokens): + batch_input_ids[i] = list( + range(base_vocab_size, + base_vocab_size + length)) + batch_input_ids[i] + if model_name == 'glm_10b': + for ids in batch_input_ids: + ids.append(tokenizer.sop_token_id) + batch_input_ids = [ + torch.tensor(x, dtype=torch.int32) for x in batch_input_ids + ] + return batch_input_ids + + +def print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=None, + output_npy=None, + context_logits=None, + generation_logits=None, + output_logits_npy=None): + batch_size, num_beams, _ = output_ids.size() + if output_csv is None and output_npy is None: + for batch_idx in range(batch_size): + inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( + ) + input_text = tokenizer.decode(inputs) + print(f'Input [Text {batch_idx}]: \"{input_text}\"') + for beam in range(num_beams): + output_begin = input_lengths[batch_idx] + output_end = sequence_lengths[batch_idx][beam] + outputs = output_ids[batch_idx][beam][ + output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + print( + f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + + if output_csv is not None: + output_file = Path(output_csv) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = output_ids.tolist() + with open(output_file, 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',') + writer.writerows(outputs) + + if output_npy is not None: + output_file = Path(output_npy) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') + np.save(output_file, outputs) 
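parse_input above also accepts pre-tokenized prompts through --input_file, either as a CSV of token ids or as a padded .npy array. A minimal sketch of producing such files with plain csv/numpy (the file names, token ids and pad_id here are placeholders, not values from this repo; real ids would come from the model's tokenizer):

```python
# Sketch: write pre-tokenized prompts in the two formats parse_input() accepts.
# Token ids below are placeholders; real ids come from the model's tokenizer.
import csv

import numpy as np

batch_token_ids = [[1, 29871, 3087, 411], [1, 450, 4996, 17354, 1701, 29916]]

# CSV: one prompt per row, comma-separated integer token ids (loaded with dtype='int32').
with open("input_ids.csv", "w", newline="") as f:
    csv.writer(f, delimiter=",").writerows(batch_token_ids)

# NPY: a padded 2-D int32 array; positions equal to pad_id are stripped on load.
pad_id = 0
max_len = max(len(ids) for ids in batch_token_ids)
padded = np.full((len(batch_token_ids), max_len), pad_id, dtype=np.int32)
for row, ids in zip(padded, batch_token_ids):
    row[:len(ids)] = ids
np.save("input_ids.npy", padded)
```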
+ + # Save context logits + if context_logits is not None and output_logits_npy is not None: + context_logits = torch.cat(context_logits, axis=0) + vocab_size_padded = context_logits.shape[-1] + context_logits = context_logits.reshape([1, -1, vocab_size_padded]) + + output_context_logits_npy = output_logits_npy.split( + '.npy')[0] + "_context" + output_context_logits_file = Path(output_context_logits_npy) + context_outputs = np.array( + context_logits.squeeze(0).cpu().contiguous(), + dtype='float32') # [promptLengthSum, vocabSize] + np.save(output_context_logits_file, context_outputs) + + # Save generation logits + if generation_logits is not None and output_logits_npy is not None and num_beams == 1: + output_generation_logits_npy = output_logits_npy.split( + '.npy')[0] + "_generation" + output_generation_logits_file = Path(output_generation_logits_npy) + generation_outputs = np.array(generation_logits.cpu().contiguous(), + dtype='float32') + np.save(output_generation_logits_file, generation_outputs) + + +def check_status(args, load_engine_time, qps): + print("==================== check status ====================") + successful = True + if args.target_load_engine_time != 0 and load_engine_time > args.target_load_engine_time: + print(f"Load engine time check failed! Target: {args.target_load_engine_time}, Actual: {load_engine_time}") + successful = False + if args.target_qps != 0 and qps < args.target_qps: + print(f"Performance check failed! Target: {args.target_qps}, Actual: {qps}") + successful = False + return successful + + +def main(args): + runtime_rank = tensorrt_llm.mpi_rank() + logger.set_level(args.log_level) + + model_name = read_model_name(args.engine_dir) + if args.tokenizer_dir is None: + args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] + + tokenizer, pad_id, end_id = load_tokenizer( + tokenizer_dir=args.tokenizer_dir, + vocab_file=args.vocab_file, + model_name=model_name, + tokenizer_type=args.tokenizer_type, + ) + + # # An example to stop generation when the model generate " London" on first sentence, " eventually became" on second sentence + # stop_words_list = [[" London"], ["eventually became"]] + # stop_words_list = tensorrt_llm.runtime.to_word_list_format(stop_words_list, tokenizer) + # stop_words_list = torch.Tensor(stop_words_list).to(torch.int32).to("cuda").contiguous() + stop_words_list = None + + # # An example to prevent generating " chef" on first sentence, " eventually" and " chef before" on second sentence + # bad_words_list = [[" chef"], [" eventually, chef before"]] + # bad_words_list = tensorrt_llm.runtime.to_word_list_format(bad_words_list, tokenizer) + # bad_words_list = torch.Tensor(bad_words_list).to(torch.int32).to("cuda").contiguous() + bad_words_list = None + + prompt_template = None + if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: + prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] + batch_input_ids = parse_input(tokenizer=tokenizer, + input_text=args.input_text, + prompt_template=prompt_template, + input_file=args.input_file, + add_special_tokens=args.add_special_tokens, + max_input_length=args.max_input_length, + pad_id=pad_id, + num_prepend_vtokens=args.num_prepend_vtokens, + model_name=model_name) + input_lengths = [x.size(0) for x in batch_input_ids] + + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." 
+ ) + args.use_py_session = True + if args.debug_mode and not args.use_py_session: + logger.warning( + "Debug mode is not supported in C++ session for now, fallback to Python session." + ) + args.use_py_session = True + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict(engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.use_py_session, "Medusa is only supported by py_session" + assert args.temperature == 0, "Medusa should use temperature == 0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=len(batch_input_ids), + max_input_len=max(input_lengths), + max_output_len=args.max_output_len, + max_beam_width=args.num_beams, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + ) + runner = runner_cls.from_dir(**runner_kwargs) + + torch.cuda.synchronize() + start_time = time.time() + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + lora_uids=args.lora_task_uids, + prompt_table_path=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True, + medusa_choices=args.medusa_choices) + torch.cuda.synchronize() + + status = False + end_time = time.time() + if runtime_rank == 0: + num_inputs = sum([torch.numel(x) for x in batch_input_ids]) + num_outputs = torch.numel(outputs["output_ids"]) + num_gens = num_outputs - num_inputs + + load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") + qps = num_gens/(end_time-start_time) + logger.info(f'Load engine takes: {load_engine_time} sec') + print(f"input tokens: {num_inputs}, generate tokens: {num_gens}, QPS: {qps}") + status = check_status(args, load_engine_time, qps) + else: + status = True + + if args.streaming: + for curr_outputs in throttle_generator(outputs, + args.streaming_interval): + if runtime_rank == 0: + output_ids = curr_outputs['output_ids'] + sequence_lengths = curr_outputs['sequence_lengths'] + print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy) + else: + if runtime_rank == 0: + output_ids = outputs['output_ids'] + sequence_lengths = outputs['sequence_lengths'] + context_logits = None + generation_logits = None + if runner.gather_context_logits: + context_logits = outputs['context_logits'] + if runner.gather_generation_logits: + generation_logits = outputs['generation_logits'] + print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy, + context_logits=context_logits, + generation_logits=generation_logits, + 
output_logits_npy=args.output_logits_npy) + + if args.run_profiling: + ite = 10 + # warmup + for _ in range(ite): + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + lora_uids=args.lora_task_uids, + prompt_table_path=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True) + torch.cuda.synchronize() + + tensorrt_llm.profiler.start("tmp") + for _ in range(ite): + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + lora_uids=args.lora_task_uids, + prompt_table_path=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True) + torch.cuda.synchronize() + tensorrt_llm.profiler.stop("tmp") + + print( + f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" + ) + if status: + print("successful.") + else: + print("failed.") + sys.exit(int(not status)) + + +if __name__ == '__main__': + args = parse_arguments() + print(args) + main(args) diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/requirements.txt b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7cbbb8b7e9bbd8aab6303fd8b5de1dacbd353b8 --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/requirements.txt @@ -0,0 +1,30 @@ +accelerate +build +colored +# cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. 
+diffusers +lark +mpi4py +numpy +onnx>=1.12.0 +polygraphy +psutil +pybind11 +pynvml>=11.5.0 +sentencepiece>=0.1.99 +# tensorrt==9.2.0.post12.dev5 +# torch +# nvidia-ammo~=0.5.0; platform_machine=="x86_64" +transformers +wheel +optimum +evaluate +janus +parameterized +scikit-learn + +# special +scipy==1.11.4 +pandas==1.5.3 +nltk +rouge_score diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/set_environment.sh b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/set_environment.sh new file mode 100644 index 0000000000000000000000000000000000000000..97e20ae79b78f7a6f65ba0837002389a85ae0e7d --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/set_environment.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +PROJECT_DIR=$1 + +pip3 install -r "$PROJECT_DIR/scripts/requirements.txt" + diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1.sh b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1.sh new file mode 100644 index 0000000000000000000000000000000000000000..79d1b888ee49a54a7db6cafbddc20c7fbd07498f --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +EXIT_STATUS=0 +LOG_LEVEL=info +BS=${BS:-1} +DTYPE=${DTYPE:-"float16"} + +PROJECT_DIR="./" + +DATASET_DIR=${DATASET_DIR:-"${PROJECT_DIR}/data/datasets_cnn_dailymail"} +MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-7b-chat"} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}/checkpoints"} + +export TLLM_LOG_LEVEL=${LOG_LEVEL} +export PLUGIN_DTYPE="float16" + +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + + +export TASK_DATA_PATH=${DATASET_DIR} + +# target is 95% of best (load engine time: 11.78, rouge1: 28.53, tps: 37.78) +python3 ${PROJECT_DIR}/summarize.py \ +--test_trt_llm \ +--log_level ${LOG_LEVEL} \ +--batch_size ${BS} \ +--data_type ${DTYPE} \ +--hf_model_dir ${MODEL_DIR} \ +--tokenizer_dir ${MODEL_DIR} \ +--tokenizer_type "llama" \ +--engine_dir ${ENGINE_DIR} \ +--target_load_engine_time 12.4 \ +--tensorrt_llm_rouge1_threshold 27.1 \ +--target_tps 35.89 \ +--use_py_session "$@"; check_status +exit ${EXIT_STATUS} diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1_build.sh b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1_build.sh new file mode 100644 index 0000000000000000000000000000000000000000..eb3cb06f0db50575598a105a7e7ebe29d08f4b3e --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1_build.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +EXIT_STATUS=0 +LOG_LEVEL=info +BS=${BS:-1} +DTYPE=${DTYPE:-"float16"} + +PROJECT_DIR="./" + +MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-7b-chat"} +OUTPUT_DIR=${OUTPUT_DIR:-"${PROJECT_DIR}/checkpoints/"} + +export TLLM_LOG_LEVEL=${LOG_LEVEL} +export PLUGIN_DTYPE="float16" + +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + + +python3 ${PROJECT_DIR}/build.py \ +--log_level ${LOG_LEVEL} \ +--dtype ${DTYPE} \ +--model_dir ${MODEL_DIR} \ +--remove_input_padding \ +--use_gpt_attention_plugin float16 --use_gemm_plugin float16 \ +--enable_context_fmha \ +--disable_xqa \ +--output_dir ${OUTPUT_DIR} "$@"; check_status +exit ${EXIT_STATUS} diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/summarize.py b/models/nlp/large_language_model/llama2-7b/trtllm/summarize.py new file mode 100644 index 
0000000000000000000000000000000000000000..acf06abd7708c098c30a40bc905a52d84d83deb6 --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/summarize.py @@ -0,0 +1,724 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import os +from pathlib import Path +import sys +import time + +import evaluate +import numpy as np +import torch +from datasets import load_dataset, load_from_disk +from transformers import (AutoModel, AutoModelForCausalLM, + AutoModelForSeq2SeqLM, GenerationConfig) +from utils import DEFAULT_HF_MODEL_DIRS, load_tokenizer, read_model_name + +import tensorrt_llm +import tensorrt_llm.profiler as profiler +from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm.logger import logger +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner +from tensorrt_llm.tools.ppl import ppl + +if PYTHON_BINDINGS: + from tensorrt_llm.runtime import ModelRunnerCpp + + +def check_status(args, load_engine_time, rouge1, tps): + print("==================== check status ====================") + successful = True + if args.target_load_engine_time != 0 and load_engine_time > args.target_load_engine_time: + print(f"Load engine time check failed! Target: {args.target_load_engine_time}, Actual: {load_engine_time}") + successful = False + if rouge1 < args.tensorrt_llm_rouge1_threshold: + print(f"Accuracy check failed! Target: {args.tensorrt_llm_rouge1_threshold}%, Actual: {rouge1}%") + successful = False + if args.target_tps != 0 and tps < args.target_tps: + print(f"Performance check failed! 
Target: {args.target_tps}, Actual: {tps}") + successful = False + return successful + + +def main(args): + runtime_rank = tensorrt_llm.mpi_rank() + logger.set_level(args.log_level) + + model_name = read_model_name(args.engine_dir) + if args.hf_model_dir is None: + args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + if args.tokenizer_dir is None: + args.tokenizer_dir = args.hf_model_dir + + test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 + test_trt_llm = args.test_trt_llm + profiler.start('load tokenizer') + tokenizer, pad_id, end_id = load_tokenizer( + tokenizer_dir=args.tokenizer_dir, + vocab_file=args.vocab_file, + model_name=model_name, + tokenizer_type=args.tokenizer_type, + ) + profiler.stop('load tokenizer') + logger.info( + f'Load tokenizer takes: {profiler.elapsed_time_in_sec("load tokenizer")} sec' + ) + + if args.eval_task == 'code_completion': + dataset_name = "openai_humaneval" + dataset_revision = None + dataset_input_key = 'prompt' + dataset_output_key = 'canonical_solution' + dataset_split = 'test' + elif args.eval_task == 'summarize': + dataset_name = "ccdv/cnn_dailymail" + dataset_revision = "3.0.0" + dataset_input_key = 'article' + dataset_output_key = 'highlights' + dataset_split = 'test' + elif args.eval_task == 'summarize_long': + dataset_name = "tau/zero_scrolls" + dataset_revision = 'squality' + dataset_input_key = 'input' + dataset_output_key = 'output' + dataset_split = 'validation' # only this split contains reference strings + + + logger.info(f"prepare datasets....") + if os.getenv("TASK_DATA_PATH"): + dataset = load_from_disk(os.getenv("TASK_DATA_PATH"))[dataset_split] + else: + # dataset = load_dataset(dataset_name, + # dataset_revision, + # cache_dir=args.dataset_path, + # split=dataset_split, + # trust_remote_code=True) + + dataset = load_dataset(dataset_name, + dataset_revision, + cache_dir=args.dataset_path, + split=dataset_split) + + logger.info(f"datasets is ready.") + max_batch_size = args.batch_size + + # runtime parameters + top_k = args.top_k + top_p = args.top_p + output_len = args.output_len + test_token_num = args.max_input_length + max_attention_window_size = args.max_attention_window_size + sink_token_length = args.sink_token_length + + # random_seed = 5 + temperature = args.temperature + num_beams = args.num_beams + length_penalty = args.length_penalty + repetition_penalty = args.repetition_penalty + presence_penalty = args.presence_penalty + frequency_penalty = args.frequency_penalty + + if test_trt_llm: + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." 
+ ) + args.use_py_session = True + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict(engine_dir=args.engine_dir, + rank=runtime_rank, + debug_mode=args.debug_mode) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.use_py_session, "Medusa is only supported by py_session" + assert args.temperature == 0, "Medusa should use temperature == 0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=max_batch_size, + max_input_len=test_token_num, + max_output_len=output_len, + max_beam_width=num_beams, + max_attention_window_size=max_attention_window_size, + sink_token_length=sink_token_length) + runner = runner_cls.from_dir(**runner_kwargs) + assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ + "PPL evaluation requires engine built with gather_all_token_logits enabled" + + if test_hf: + profiler.start('load HF model') + dtype_alias_mapping = { + 'fp32': 'float32', + 'fp16': 'float16', + 'bf16': 'bfloat16' + } + args.data_type = dtype_alias_mapping.get(args.data_type, args.data_type) + if model_name.startswith('chatglm'): + auto_model_cls = AutoModel + elif model_name.startswith('glm'): + auto_model_cls = AutoModelForSeq2SeqLM + else: + auto_model_cls = AutoModelForCausalLM + model = auto_model_cls.from_pretrained( + args.hf_model_dir, + trust_remote_code=True, + torch_dtype=str_dtype_to_torch(args.data_type), + device_map='auto' if args.hf_device_map_auto else None) + try: + model.to_bettertransformer() + except ValueError as e: + logger.warning( + f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' + ) + if not args.hf_device_map_auto: + model.cuda() + if model_name == 'qwen': + model.generation_config = GenerationConfig.from_pretrained( + args.hf_model_dir, trust_remote_code=True) + profiler.stop('load HF model') + logger.info( + f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' + ) + + output_dir = Path(args.output_dir) if args.output_dir else None + if output_dir is not None: + output_dir.mkdir(exist_ok=True, parents=True) + if test_trt_llm: + with (output_dir / 'trtllm.out').open('w') as f: + f.write(f'Engine path: {args.engine_dir}\n') + f.write(f'Tokenizer path: {args.tokenizer_dir}\n') + if test_hf: + with (output_dir / 'hf.out').open('w') as f: + f.write(f'Model path: {args.hf_model_dir}\n') + f.write(f'Tokenizer path: {args.tokenizer_dir}\n') + + def _prepare_inputs(batch_input_texts, + eval_task='summarize', + add_special_tokens=True): + batch_size = len(batch_input_texts) + append_str = ' TL;DR: ' if eval_task == 'summarize' else '' + batch_input_ids = [] + for i in range(batch_size): + curr_text = batch_input_texts[i] + append_str + curr_text = curr_text.strip().replace(" n't", "n't") + + # TODO: The below lines are used to be compatible with the original code; may need fix + if model_name.startswith(('chatglm2', 'chatglm3')): + input_ids = tokenizer.encode(curr_text, + return_tensors='pt').squeeze(0) + input_ids = input_ids[:test_token_num] + elif model_name == 'qwen': + from qwen.utils.utils import make_context + # use make_content to generate prompt + system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user." 
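+                # make_context comes from the Qwen repository's utils; it is expected to wrap
+                # the article and system_prompt in Qwen's chat template and to truncate the
+                # prompt to max_input_length before returning the token id list used below.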
+ _, input_id_list = make_context( + tokenizer=tokenizer, + query=curr_text, + history=[], + system=system_prompt, + max_input_length=test_token_num, + ) + input_ids = torch.tensor(input_id_list) + else: + input_ids = tokenizer.encode( + curr_text, + return_tensors='pt', + add_special_tokens=add_special_tokens, + truncation=True, + max_length=test_token_num).squeeze(0) + + batch_input_ids.append(input_ids) + return batch_input_ids + + def eval_trt_llm(datapoint, + eval_task='summarize', + eval_ppl=False, + add_special_tokens=True): + batch_size = len(datapoint[dataset_input_key]) + batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], + eval_task=eval_task, + add_special_tokens=add_special_tokens) + input_lengths = [x.size(0) for x in batch_input_ids] + + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=output_len, + max_attention_window_size=max_attention_window_size, + sink_token_length=sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=temperature, + top_k=top_k, + top_p=top_p, + num_beams=num_beams, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + output_sequence_lengths=True, + return_dict=True, + medusa_choices=args.medusa_choices) + torch.cuda.synchronize() + + # Extract a list of tensors of shape beam_width x output_ids. + if runtime_rank == 0: + output_ids = outputs['output_ids'] + output_beams_list = [ + tokenizer.batch_decode(output_ids[batch_idx, :, + input_lengths[batch_idx]:], + skip_special_tokens=True) + for batch_idx in range(batch_size) + ] + output_ids_list = [ + output_ids[batch_idx, :, input_lengths[batch_idx]:] + for batch_idx in range(batch_size) + ] + + ppls = [[] for _ in range(batch_size)] + seq_lengths_array = outputs["sequence_lengths"].cpu().tolist() + lengths_info = { + 'input_lengths': input_lengths, + 'seq_lengths': seq_lengths_array + } + if eval_ppl: + seq_lengths = outputs['sequence_lengths'] + context_logits = outputs['context_logits'] + # Remove the first generation logits which are same to last context logits + generation_logits = outputs['generation_logits'][:, :, 1:] + for batch_idx in range(batch_size): + # [batch, beam, step] + for beam_idx in range(num_beams): + curr_len = seq_lengths[batch_idx, beam_idx] + curr_ctx_len = input_lengths[batch_idx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[batch_idx, beam_idx, 1:curr_len] + curr_logits = torch.cat([ + context_logits[batch_idx], + generation_logits[batch_idx, + beam_idx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + logger.debug( + f"TensorRT-LLM PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + ppls[batch_idx].append(curr_ppl) + + return output_beams_list, output_ids_list, ppls, lengths_info + return [], [], [], {} + + def eval_hf(datapoint, + eval_task='summarize', + eval_ppl=False, + add_special_tokens=True): + batch_size = len(datapoint[dataset_input_key]) + if batch_size > 1: + logger.warning( + f"HF does not support batch_size > 1 to verify correctness due to padding. 
Current batch size is {batch_size}" + ) + batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], + eval_task=eval_task, + add_special_tokens=add_special_tokens) + input_lengths = [x.size(0) for x in batch_input_ids] + # Left padding for HF + max_length = max(input_lengths) + paddings = [ + torch.ones(max_length - l, dtype=torch.int32) * pad_id + for l in input_lengths + ] + batch_input_ids = [ + torch.cat([pad, x]) for x, pad in zip(batch_input_ids, paddings) + ] + batch_input_ids = torch.stack(batch_input_ids) + batch_input_ids = batch_input_ids.cuda() + + with torch.no_grad(): + outputs = model.generate(batch_input_ids, + max_new_tokens=output_len, + top_k=top_k, + temperature=temperature, + eos_token_id=end_id, + pad_token_id=pad_id, + num_beams=num_beams, + num_return_sequences=num_beams, + early_stopping=True, + length_penalty=length_penalty, + output_scores=True, + return_dict_in_generate=True) + if eval_ppl and batch_size == 1: + # model.generate cannot return context logits? + # Will cause additional latency + context_outputs = model(batch_input_ids) + + output_ids = outputs['sequences'] + tokens_list = output_ids[:, len(batch_input_ids[0]):].tolist() + output_ids = output_ids.reshape([batch_size, num_beams, -1]) + output_lines_list = [ + tokenizer.batch_decode(output_ids[:, i, + len(batch_input_ids[0]):], + skip_special_tokens=True) + for i in range(num_beams) + ] + + ppls = [[] for _ in range(batch_size)] + if eval_ppl and batch_size == 1: + # Only for batch size of 1 + seq_lens = (output_ids != end_id).logical_and( + output_ids != pad_id).sum(dim=-1) + context_logits = context_outputs['logits'] + # Remove the first generation logits which are same to last context logits + generation_logits = torch.stack(outputs['scores'][1:], dim=1) + _, max_gen_len, voc_size = generation_logits.size() + generation_logits = generation_logits.view(batch_size, num_beams, + max_gen_len, voc_size) + for batch_idx in range(batch_size): + for beam_idx in range(num_beams): + curr_len = seq_lens[batch_idx, beam_idx] + curr_ctx_len = input_lengths[batch_idx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[batch_idx, beam_idx, 1:curr_len] + curr_logits = torch.cat([ + context_logits[batch_idx], + generation_logits[batch_idx, + beam_idx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + logger.debug( + f"HF PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + ppls[batch_idx].append(curr_ppl) + + return output_lines_list, tokens_list, ppls + + if test_trt_llm: + datapoint = dataset[0:1] + output, *_ = eval_trt_llm(datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + if runtime_rank == 0: + logger.info( + "---------------------------------------------------------") + logger.info("TensorRT-LLM Generated : ") + logger.info(f" Input : {datapoint[dataset_input_key]}") + logger.info(f"\n Reference : {datapoint[dataset_output_key]}") + logger.info(f"\n Output : {output}") + logger.info( + "---------------------------------------------------------") + if test_hf: + datapoint = dataset[0:1] + output, *_ = eval_hf(datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + logger.info("---------------------------------------------------------") + logger.info("HF Generated : ") + logger.info(f" Input : {datapoint[dataset_input_key]}") + logger.info(f"\n Reference : {datapoint[dataset_output_key]}") + logger.info(f"\n Output : {output}") + 
logger.info("---------------------------------------------------------") + + # TODO: Add random_seed flag in gptj + metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)] + metric_hf = [evaluate.load("rouge") for _ in range(num_beams)] + for i in range(num_beams): + metric_tensorrt_llm[i].seed = 0 + metric_hf[i].seed = 0 + ppls_trt_llm = [[] for _ in range(num_beams)] + ppls_hf = [[] for _ in range(num_beams)] + + ite_count = 0 + data_point_idx = 0 + total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 + + if args.stability_test: + logger.info(f"stability test, need {args.stability_test_hours} hours") + else: + logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") + stability_start_time = time.time() + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + max_batch_size)] + + if test_trt_llm: + profiler.start('tensorrt_llm') + output_tensorrt_llm, output_ids_trt_llm, curr_ppls_trt_llm, lengths_info = eval_trt_llm( + datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + profiler.stop('tensorrt_llm') + if runtime_rank == 0: + input_lengths = lengths_info['input_lengths'] + seq_lengths = lengths_info['seq_lengths'] + output_token_count_trt_llm = sum( + seq_lengths[idx][0] - input_lengths[idx] + for idx in range(len(input_lengths))) + total_output_token_count_trt_llm += output_token_count_trt_llm + + if test_hf: + profiler.start('hf') + output_hf, _, curr_ppls_hf = eval_hf( + datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens) + profiler.stop('hf') + + if runtime_rank == 0: + if test_trt_llm: + for batch_idx in range(len(output_tensorrt_llm)): + for beam_idx in range(num_beams): + metric_tensorrt_llm[beam_idx].add_batch( + predictions=[ + output_tensorrt_llm[batch_idx][beam_idx] + ], + references=[ + datapoint[dataset_output_key][batch_idx] + ]) + if args.eval_ppl: + ppls_trt_llm[beam_idx].append( + curr_ppls_trt_llm[batch_idx][beam_idx]) + if output_dir is not None: + # yapf: disable + for i in range(len(output_tensorrt_llm[0])): + for beam_idx in range(num_beams): + with (output_dir / 'trtllm.out').open('a') as f: + f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n') + # yapf: enable + if test_hf: + for beam_idx in range(num_beams): + for batch_idx in range(len(output_hf[beam_idx])): + metric_hf[beam_idx].add_batch( + predictions=[output_hf[beam_idx][batch_idx]], + references=[ + datapoint[dataset_output_key][batch_idx] + ]) + if args.eval_ppl and args.batch_size == 1: + ppls_hf[beam_idx].append( + curr_ppls_hf[batch_idx][beam_idx]) + if output_dir is not None: + # yapf: disable + for i in range(len(output_hf[0])): + for beam_idx in range(num_beams): + with (output_dir / 'hf.out').open('a') as f: + f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n') + # yapf: enable + + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") + if test_trt_llm: + logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') + if test_hf: + logger.debug(f'HF Output: {output_hf}') + logger.debug(f"Reference : {datapoint[dataset_output_key]}") + + data_point_idx += max_batch_size + ite_count += 1 + + if args.stability_test: + test_time_hours = round((time.time() - 
stability_start_time)/3600, 1) + if test_time_hours > args.stability_test_hours: + if runtime_rank == 0: + logger.info(f"Stability Test Finished. Total run {test_time_hours} hours.") + break + else: + data_point_idx = data_point_idx % len(dataset) + ite_count = ite_count % args.max_ite + if runtime_rank == 0 and ite_count % 100 == 0: + logger.info(f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") + elif runtime_rank == 0 and ite_count % 10 == 0: + logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + + if runtime_rank == 0: + if test_trt_llm: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' + ) + logger.info( + f'TensorRT-LLM (total output tokens: {total_output_token_count_trt_llm})' + ) + logger.info( + f'TensorRT-LLM (tokens per second: {total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm")})' + ) + + rouge1 = 0 + tps = total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm") + + for beam_idx in range(num_beams): + logger.info(f"TensorRT-LLM beam {beam_idx} result") + computed_metrics_tensorrt_llm = metric_tensorrt_llm[ + beam_idx].compute() + for key in computed_metrics_tensorrt_llm.keys(): + logger.info( + f' {key} : {computed_metrics_tensorrt_llm[key]*100}') + + if args.check_accuracy and beam_idx == 0: + assert computed_metrics_tensorrt_llm[ + 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold + + if beam_idx == 0: + rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 + + if args.eval_ppl: + logger.info( + f" Per-token perplexity: {np.mean(ppls_trt_llm[beam_idx])}" + ) + if args.check_accuracy and beam_idx == 0: + assert np.mean(ppls_trt_llm[beam_idx] + ) < args.tensorrt_llm_ppl_threshold + + load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") + logger.info(f'Load engine takes: {load_engine_time} sec') + + status = check_status(args, load_engine_time, rouge1, tps) + if status: + print("successful.") + else: + print("failed.") + + sys.exit(int(not status)) + + if test_hf: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'Hugging Face (total latency: {profiler.elapsed_time_in_sec("hf")} sec)' + ) + for beam_idx in range(num_beams): + logger.info(f"HF beam {beam_idx} result") + computed_metrics_hf = metric_hf[beam_idx].compute() + for key in computed_metrics_hf.keys(): + logger.info(f' {key} : {computed_metrics_hf[key]*100}') + if args.eval_ppl and args.batch_size == 1: + logger.info( + f" Per-token perplexity: {np.mean(ppls_hf[beam_idx])}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) + parser.add_argument( + '--tokenizer_dir', + default=None, + help='tokenizer path; defaults to hf_model_dir if left unspecified') + parser.add_argument( + '--tokenizer_type', + help= + 'Specify that argument when providing a .model file as the tokenizer_dir. 
' + 'It allows AutoTokenizer to instantiate the correct tokenizer type.') + parser.add_argument('--vocab_file') + parser.add_argument('--test_hf', action='store_true') + parser.add_argument('--test_trt_llm', action='store_true') + parser.add_argument( + '--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], + default='fp16') + parser.add_argument('--engine_dir', type=str, default='engine_outputs') + parser.add_argument('--use_py_session', + default=False, + action='store_true', + help="Whether or not to use Python runtime session") + parser.add_argument( + '--eval_task', + type=str, + default='summarize', + choices=['summarize', 'summarize_long', 'code_completion']) + parser.add_argument('--check_accuracy', action='store_true') + parser.add_argument('--tensorrt_llm_rouge1_threshold', + type=float, + default=15.0) + parser.add_argument('--eval_ppl', action='store_true') + parser.add_argument('--tensorrt_llm_ppl_threshold', + type=float, + default=15.0) + parser.add_argument('--target_load_engine_time', + type=float, + default=0) + parser.add_argument('--target_tps', + type=float, + default=0) + parser.add_argument('--dataset_path', type=str, default='') + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--batch_size', type=int, default=1) + parser.add_argument('--max_ite', type=int, default=20) + parser.add_argument('--output_len', type=int, default=100) + parser.add_argument('--max_input_length', type=int, default=923) + parser.add_argument( + '--max_attention_window_size', + type=int, + default=None, + help= + 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' + ) + parser.add_argument('--sink_token_length', + type=int, + default=None, + help='The sink token length.') + parser.add_argument('--num_beams', type=int, default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--debug_mode', + default=False, + action='store_true', + help="Whether or not to turn on the debug mode") + parser.add_argument('--no_add_special_tokens', + dest='add_special_tokens', + default=True, + action='store_false', + help="Whether or not to add special tokens") + parser.add_argument( + '--hf_device_map_auto', + action='store_true', + help="Use device map 'auto' to load a pretrained HF model. This may " + "help to test a large model that cannot fit into a singlue GPU.") + parser.add_argument( + '--output_dir', + type=str, + default=None, + help="Directory where to save output sentences. 'trtllm.out' for " + "TensorRT-LLM outputs, and 'hf.out' for HF outputs. If None, do not " + "save outputs.") + parser.add_argument( + '--medusa_choices', + type=str, + default=None, + help="Medusa choice to use, if not none, will use Medusa decoding." + " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." 
+ ) + parser.add_argument('--stability_test', + default=False, + action='store_true', + help="Whether or not to run stability test for tensorrt_llm.") + parser.add_argument('--stability_test_hours', type=float, default=24.0) + args = parser.parse_args() + print(args) + main(args) diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/utils.py b/models/nlp/large_language_model/llama2-7b/trtllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..44042d9e2dcb44dd6cd917ab16a00010e4005202 --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/utils.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Optional + +from transformers import AutoTokenizer, T5Tokenizer + +import tensorrt_llm + +DEFAULT_HF_MODEL_DIRS = { + 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', + 'bloom': 'bigscience/bloom-560m', + 'chatglm_6b': 'THUDM/chatglm-6b', + 'chatglm2_6b': 'THUDM/chatglm2-6b', + 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k', + 'chatglm3_6b': 'THUDM/chatglm3-6b', + 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base', + 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k', + 'falcon': 'tiiuae/falcon-rw-1b', + 'glm_10b': 'THUDM/glm-10b', + 'gpt': 'gpt2-medium', + 'gptj': 'EleutherAI/gpt-j-6b', + 'gptneox': 'EleutherAI/gpt-neox-20b', + 'internlm': 'internlm/internlm-chat-7b', + 'llama': 'meta-llama/Llama-2-7b-hf', + 'mpt': 'mosaicml/mpt-7b', + 'phi': 'microsoft/phi-2', + 'opt': 'facebook/opt-350m', + 'qwen': 'Qwen/Qwen-7B', +} + +DEFAULT_PROMPT_TEMPLATES = { + 'internlm': + "<|User|>:{input_text}\n<|Bot|>:", + 'qwen': + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", +} + + +def read_model_name(engine_dir: str): + engine_version = tensorrt_llm.runtime.engine.get_engine_version(engine_dir) + + with open(Path(engine_dir) / "config.json", 'r') as f: + config = json.load(f) + + if engine_version is None: + return config['builder_config']['name'] + + return config['pretrained_config']['architecture'] + + +def throttle_generator(generator, stream_interval): + for i, out in enumerate(generator): + if not i % stream_interval: + yield out + + if i % stream_interval: + yield out + + +def load_tokenizer(tokenizer_dir: Optional[str] = None, + vocab_file: Optional[str] = None, + model_name: str = 'gpt', + tokenizer_type: Optional[str] = None): + if vocab_file is None: + use_fast = True + if tokenizer_type is not None and tokenizer_type == "llama": + use_fast = False + # Should set both padding_side and truncation_side to be 'left' + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + legacy=False, + padding_side='left', + truncation_side='left', + trust_remote_code=True, + tokenizer_type=tokenizer_type, + use_fast=use_fast) + else: + # For gpt-next, directly load from tokenizer.model + assert 
model_name == 'gpt' + tokenizer = T5Tokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left') + + if model_name == 'qwen': + with open(Path(tokenizer_dir) / "generation_config.json") as f: + gen_config = json.load(f) + chat_format = gen_config['chat_format'] + if chat_format == 'raw': + pad_id = gen_config['pad_token_id'] + end_id = gen_config['eos_token_id'] + elif chat_format == 'chatml': + pad_id = tokenizer.im_end_id + end_id = tokenizer.im_end_id + else: + raise Exception(f"unknown chat format: {chat_format}") + elif model_name == 'glm_10b': + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eop_token_id + else: + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eos_token_id + + return tokenizer, pad_id, end_id diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e965773230da3333cb162691394b65c109709677 --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md @@ -0,0 +1,44 @@ +# Qwen-7B + +## Description +Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models. + +## Setup + +### Install +```bash +yum install mesa-libGL + +# Please contact the staff to obtain the relevant installlation packages. 
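+# The "Path/To/xxx.whl" entries below are placeholders; replace them with the actual
+# paths of the wheel files you received before running pip3 install.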
+pip3 install Path/To/bitsandbytes-xxx.whl +pip3 install Path/To/flash_atten-xxx.whl +pip3 install Path/To/ixformer-xxx.whl +pip3 install Path/To/vllm-xxx.whl +pip3 install Path/To/eetq-xxx.whl +pip3 install Path/To/text-generation-xxx.whl +pip3 install Path/To/text-generation-server-xxx.whl +``` + +### Download +-Model: https://modelscope.cn/models/qwen/Qwen-7B/summary + +```bash +# Make sure the model's file name is qwen-7B +mkdir data +``` + +## Inference + +### Start webserver +#### Single GPU +```bash +# Use one docker container to start webserver +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +ENABLE_INFER_PG=1 CUDA_VISIBLE_DEVICES=0 USE_FLASH_ATTENTION=true text-generation-launcher --model-id ./data/qwen-7B --sharded false --dtype float16 --disable-custom-kernels --port 8001 --max-input-length 13312 --max-batch-prefill-tokens 13312 --max-total-tokens 15360 --max-batch-total-tokens 15360 +``` +#### Offline test +```bash +# Use another docker container to run offline test +export CUDA_VISIBLE_DEVICES=1 +python3 offline_inference.py --model2path ./data/qwen-7B +``` \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..57db633401e7849adac36f5f9e6ad166fdf38bbd --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py @@ -0,0 +1,146 @@ +from text_generation_server.models.flash_qwen import ( + FlashQwen, + ) +import torch +from text_generation_server.pb import generate_pb2 + +import time +from torch.cuda import profiler +import argparse + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('--generate_length', type=int, default=512) + parser.add_argument('--model2path', type=str, default="/home/data/nlp/qwen/qwen-7B") + parser.add_argument('--quantize', type=str, default=None, choices=['awq']) + parser.add_argument('--speculate', type=int, default=0) + + return parser.parse_args(args) + +if __name__ == "__main__": + args = parse_args() + isNewVersion = True + try: + from text_generation_server.utils.speculate import set_speculate + except ImportError: + isNewVersion = False + print("use n-gram speculate must update tgi version to 1.4.3+") + else: + set_speculate(args.speculate) + max_input_length = 2048 + max_prefill_tokens = 2048 + model = FlashQwen(args.model2path, trust_remote_code=True) + + first_line = "蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是" + + default_pb_parameters = generate_pb2.NextTokenChooserParameters( + temperature=1.0, + repetition_penalty=1.0, + top_k=0, + top_p=1, + typical_p=1.0, + do_sample=False, + ) + + default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=args.generate_length) + + warmup_requests = generate_pb2.Request( + id=0, + inputs="_test " * max_input_length, + prefill_logprobs=True, + truncate=max_input_length, + parameters=generate_pb2.NextTokenChooserParameters( + temperature=0.9, + top_k=10, + top_p=0.9, + typical_p=0.9, + do_sample=False, + seed=0, + repetition_penalty=1.2, + watermark=True, + ), + stopping_parameters=generate_pb2.StoppingCriteriaParameters( + max_new_tokens=2, + stop_sequences=[], + ignore_eos_token=False, + ), + top_n_tokens = 20 + ) + warmup_requests_batch = generate_pb2.Batch(id=0, requests=[warmup_requests], size=1) + warmup_requests_batchs = 
model.batch_type.from_pb( + warmup_requests_batch, model.tokenizer, model.dtype, torch.device("cuda") + ) + + model.warmup(warmup_requests_batchs) + + pb_request = generate_pb2.Request( + id=1, + inputs=first_line, + prefill_logprobs=True, + truncate=1024, + parameters=default_pb_parameters, + stopping_parameters=default_pb_stop_parameters, + ) + pb_one_batch = generate_pb2.Batch(id=1, requests=[pb_request], size=1) + causal_lm_one_batch = model.batch_type.from_pb( + pb_one_batch, model.tokenizer, model.dtype, torch.device("cuda") + ) + + next_batch_one = causal_lm_one_batch + last_generations = True + torch.cuda.synchronize() + profiler.start() + start_time = time.perf_counter() + for _ in range(causal_lm_one_batch.stopping_criterias[0].max_new_tokens - 1): + data = model.generate_token(next_batch_one) + if isNewVersion: + generations_one, next_batch_one, _ = data + else: + generations_one, next_batch_one = data + if next_batch_one is None: + last_generations = False + break + if last_generations: + data = model.generate_token(next_batch_one) + generations_one = data[0] + profiler.stop() + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + print(f"generate length: {generations_one[0].generated_text.generated_tokens}") + print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") + +""" +qwen-7B +亚的斯亚贝巴(Addis Ababa) +尼日利亚的首都是阿布贾(Abuja) +巴基斯坦的首都是伊斯兰堡(Islamabad) +菲律宾的首都是马尼拉(Manila) +波兰的首都是华沙(Warsaw) +葡萄牙的首都是里斯本(Lisbon) +俄罗斯的首都是莫斯科(Moscow) +新加坡的首都是新加坡(Singapore) +南非的首都是比勒陀利亚(Pretoria) +西班牙的首都是马德里(Madrid) +斯里兰卡的首都是斯里贾亚瓦德纳普拉克特(Sri Jayawardenepura Kotte) +斯洛伐克的首都是布拉迪斯拉发(Bratislava) +斯洛文尼亚的首都是卢布尔雅那(Ljubljana) +南非的首都是比勒陀利亚(Pretoria) +瑞典的首都是斯德哥尔摩(Stockholm) +瑞士的首都是伯尔尼(Bern) +泰国的首都是曼谷(Bangkok) +土耳其的首都是安卡拉(Ankara) +乌克兰的首都是基辅(Kyiv) +英国的首都是伦敦(London) +美国的首都是华盛顿特区(Washington, D.C.) +乌兹别克斯坦的首都是塔什干(Tashkent) +委内瑞拉的首都是加拉加斯(Caracas) +越南的首都是河内(Hanoi) +赞比亚的首都是卢萨卡(Lusaka) +津巴布韦的首都是哈拉雷(Harare) +以上是世界上一些国家的首都,当然还有很多其他国家的首都,这里只是列举了一些比较有代表性的。 2022年广东省公务员考试公告于11月26日发布,报考者可在 2021年11月29日9︰00至12月3日16︰00 的时间内报名。建议小伙伴们根据本人的专业、意愿和职业规划等选择报考职位,不要等到最后才匆忙报名,以免因时间不足等情况无法完成报名而造成遗憾。 + ——2022年广东省考报名有关解答—— + 报考者如何办理考试费减免手续? + 答:报考者如属城乡最低生活保障对象,可向报考职位所在考区考务机构申请减免考试费,申请对象需提交其家庭所在地的县(区、 +qps: 34.23966521171583 +""" \ No newline at end of file
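
Usage note for the Qwen-7B text-generation-inference setup above: once `text-generation-launcher` is serving on port 8001 (as in the README), the endpoint can also be queried over HTTP from another process. The sketch below is illustrative only; it assumes the standard text-generation-inference REST API (`POST /generate`) and that the server is reachable at `localhost:8001`. Adjust the host, port, and generation parameters to your deployment.

```python
# Minimal client-side sketch for the webserver started in the Qwen-7B README.
# Assumptions: the launcher is listening on localhost:8001 and exposes the
# standard text-generation-inference "/generate" endpoint.
import requests


def generate(prompt: str, max_new_tokens: int = 64) -> str:
    resp = requests.post(
        "http://localhost:8001/generate",
        json={
            "inputs": prompt,
            "parameters": {"max_new_tokens": max_new_tokens},
        },
        timeout=120,
    )
    resp.raise_for_status()
    # The /generate endpoint returns a JSON object with a "generated_text" field.
    return resp.json()["generated_text"]


if __name__ == "__main__":
    # Same style of prompt as offline_inference.py uses for its offline test.
    print(generate("蒙古国的首都是乌兰巴托(Ulaanbaatar)\n埃塞俄比亚的首都是"))
```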