diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE
new file mode 100644
index 0000000000000000000000000000000000000000..2d7f4c35f4f18b7385598e9a5e4c2d3e3cc272d5
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE
@@ -0,0 +1,30 @@
+BSD 3-Clause License
+
+
+Copyright (c) 2017,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e6307976660a136161c1e763f2f810eb9651d7ad
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch
@@ -0,0 +1,42 @@
+diff -uNr ascend-llm/export_llama/export_llama.py ascend-llm-qwen/export_llama/export_llama.py
+--- ascend-llm/export_llama/export_llama.py 2024-09-05 15:10:55.831311000 +0800
++++ ascend-llm-qwen/export_llama/export_llama.py 2024-09-05 15:20:09.720307600 +0800
+@@ -2,16 +2,17 @@
+ import importlib
+ import torch
+ import os
+-from transformers import LlamaForCausalLM, LlamaTokenizer
+-
++from transformers import Qwen2ForCausalLM, Qwen2Tokenizer
++import torch_npu
++from torch_npu.contrib import transfer_to_npu
++torch_npu.npu.set_device("npu:3")
+
+ def export_onnx(base_model,out_path,quant_cfg_path,act_path):
+- tokenizer= LlamaTokenizer.from_pretrained(base_model)
+- model = LlamaForCausalLM.from_pretrained(
++ tokenizer= Qwen2Tokenizer.from_pretrained(base_model)
++ model = Qwen2ForCausalLM.from_pretrained(
+ base_model,
+ torch_dtype=torch.float16,
+- device_map="auto",
+- )
++ ).npu()
+ model_cfg=model.model.config
+ spec = importlib.util.spec_from_file_location("quant_cfg_module", quant_cfg_path)
+ quant_cfg_module = importlib.util.module_from_spec(spec)
+diff -uNr ascend-llm/inference/config.py ascend-llm-qwen/inference/config.py
+--- ascend-llm/inference/config.py 2024-09-05 15:10:55.833305200 +0800
++++ ascend-llm-qwen/inference/config.py 2024-09-05 15:20:36.210316200 +0800
+@@ -28,9 +28,9 @@
+ head_len:int= 32 # 在KVCache evict时前head len会被保留
+ recent_len:int = 32 # 在KVCache evict时最近recent len会被保留
+ evict_len:int = 64 # KVCache 逐出的最小值,当KVCache达到最大值时将逐出evict_len个KVCache
+- n_layer:int = 22
++ n_layer:int = 28
+ format:str='huggingface-tensor' #KVcache的格式
+- max_cache_size:int=256 # kvcache的最大长度
++ max_cache_size:int=1024 # kvcache的最大长度
+ head_num:int=4
+ num_kv_group:int = 8 # for GQA
+ head_dim:int=64
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch
new file mode 100644
index 0000000000000000000000000000000000000000..84e6d03e70ab2c18f31f8fdd827b500842d336ab
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch
@@ -0,0 +1,70 @@
+--- modeling_qwen2.py 2024-09-04 22:30:47.490111800 +0800
++++ modeling_qwen2_export.py 2024-09-04 22:49:20.540908500 +0800
+@@ -162,6 +162,10 @@
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
++ return(
++ self.cos_cached.to(dtype=x.dtype),
++ self.sin_cached.to(dtype=x.dtype),
++ )
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+@@ -312,6 +316,7 @@
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
++ out_key_value = (key_states, value_states) if use_cache else None
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+@@ -351,7 +356,7 @@
+ if not output_attentions:
+ attn_weights = None
+
+- return attn_output, attn_weights, past_key_value
++ return attn_output, attn_weights, out_key_value
+
+
+ class Qwen2FlashAttention2(Qwen2Attention):
+@@ -895,7 +900,7 @@
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+- next_decoder_cache = None
++ next_decoder_cache = [] if use_cache else None
+
+ for decoder_layer in self.layers:
+ if output_hidden_states:
+@@ -926,7 +931,11 @@
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
++ key_values = layer_outputs[2 if output_attentions else 1]
++ if isinstance(next_decoder_cache,tuple):
++ next_decoder_cache=list(next_decoder_cache)
++ assert isinstance(next_decoder_cache, list),"transform failed"
++                next_decoder_cache.extend(key_values)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+@@ -937,9 +946,7 @@
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+- next_cache = None
+- if use_cache:
+- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
++        next_cache = torch.stack(next_decoder_cache).reshape(len(self.layers),2,*next_decoder_cache[0].shape) if use_cache else None
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+@@ -1433,4 +1440,4 @@
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+- )
+\ No newline at end of file
++ )
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py
new file mode 100644
index 0000000000000000000000000000000000000000..0147f8fdf5c24c2765495bedf1ad9b3edf0307d9
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py
@@ -0,0 +1,58 @@
+# BSD 3-Clause License
+
+# Copyright (c) 2017,
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# * Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import onnx
+import onnx.helper as helper
+
+# Rewrite every Trilu node so that it keeps only its first input and carries a
+# fixed upper=1 attribute; all other nodes are copied over unchanged.
+model = onnx.load("qwen2.onnx")
+new_nodes = []
+
+for node in model.graph.node:
+    if node.op_type == "Trilu":
+        new_node = helper.make_node(
+            "Trilu",
+            inputs=[node.input[0]],
+            outputs=node.output,
+            upper=1
+        )
+        new_nodes.append(new_node)
+    else:
+        new_nodes.append(node)
+
+new_graph = helper.make_graph(
+ new_nodes,
+ "new_graph",
+ inputs=model.graph.input,
+ outputs=model.graph.output,
+ value_info=model.graph.value_info,
+ initializer=model.graph.initializer
+)
+
+new_model = helper.make_model(new_graph, producer_name=model.producer_name, opset_imports=model.opset_import, ir_version=model.ir_version)
+onnx.save(new_model, "qwen2.onnx", save_as_external_data=True)
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..6964a3f5b9ebcdd7b6e4516360f2244f1bbca8b0
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md
@@ -0,0 +1,201 @@
+# README
+
+This README describes the offline model export and inference scripts for Qwen2-7B on the Ascend 310B environment and explains how to use them.
+
+- [Overview](#ZH-CN_TOPIC_0000001172161501)
+
+  - [Input and Output Data](#section540883920406)
+
+- [Inference Environment Setup](#ZH-CN_TOPIC_0000001126281702)
+
+- [Quick Start](#ZH-CN_TOPIC_0000001126281700)
+
+  - [Obtaining the Source Code](#section4622531142816)
+  - [Model Conversion and Inference](#section741711594517)
+
+
+# Overview
+
+Tongyi Qianwen (Qwen) is a large-scale language model developed by Alibaba Cloud. It can answer questions, compose text, express opinions, and write code.
+
+## Input and Output Data
+
+- Input data
+
+  | Input           | Data Type | Shape                                   | Format |
+  | --------------- | --------- | --------------------------------------- | ------ |
+  | input_ids       | INT64     | 1 x 1                                   | ND     |
+  | attention_mask  | INT64     | 1 x 1                                   | ND     |
+  | position_ids    | INT64     | 1 x 1                                   | ND     |
+  | past_key_values | FLOAT16   | layers, 2, 1, n_heads, kv_len, head_dim | ND     |
+
+- Output data (a minimal I/O smoke-test sketch follows the tables)
+
+  | Output         | Data Type | Shape                                   | Format |
+  | -------------- | --------- | --------------------------------------- | ------ |
+  | logits         | FLOAT32   | 1 x vocab_size                          | ND     |
+  | out_key_values | FLOAT16   | layers, 2, 1, n_heads, kv_len, head_dim | ND     |
+
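+  The following is a minimal, illustrative I/O smoke test, not part of the delivered scripts: it feeds dummy tensors with the single-token decode shapes used by the ATC command below into the exported ONNX model. It assumes the export kept the input/output names and dtypes listed above, and it requires onnxruntime, which is not listed in requirements.txt.
+
+  ```
+  # Hypothetical shape check for the exported qwen2.onnx (Qwen2-7B: 28 layers,
+  # 4 KV heads, kv_len 1024, head_dim 128, matching the ATC --input_shape below).
+  import numpy as np
+  import onnxruntime as ort
+
+  sess = ort.InferenceSession("qwen2.onnx")
+  print([(i.name, i.shape) for i in sess.get_inputs()])  # inspect the expected inputs
+
+  feed = {
+      "input_ids":       np.zeros((1, 1), dtype=np.int64),
+      "attention_mask":  np.ones((1, 1025), dtype=np.int64),
+      "position_ids":    np.zeros((1, 1), dtype=np.int64),
+      "past_key_values": np.zeros((28, 2, 1, 4, 1024, 128), dtype=np.float16),
+  }
+  logits, out_key_values = sess.run(None, feed)
+  print(logits.shape, out_key_values.shape)  # expect (1, vocab_size) and the KV layout above
+  ```
+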
+# Inference Environment Setup
+
+  **Table 1** Version compatibility
+
+  | Component            | Version             | Package / Environment Setup Guide                                                                                 |
+  | -------------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- |
+  | Firmware and drivers | Ascend HDK 24.1.RC3 | [PyTorch framework inference environment setup](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) |
+  | CANN                 | CANN 8.0.RC3        | https://cmc-szv.clouddragon.huawei.com/cmcversion/index/releaseView?deltaId=10860207193326848&isSelect=Software   |
+  | Python               | 3.9.19              | -                                                                                                                   |
+  | PyTorch              | 2.1.0               | -                                                                                                                   |
+
+  Note: for 310B inference cards, select the actual firmware and driver version according to the CANN version.
+
+# Quick Start
+
+## Obtaining the Source Code
+
+1. Get the source code.
+
+ ```
+   # Source commit: 1392d7f. This repository is no longer updated; the latest version can be fetched as follows.
+ git clone https://gitee.com/yinghuo302/ascend-llm
+ cd ascend-llm
+ patch -p0 < diff.patch
+ cp $install_python_path/lib/site-packages/transformers/models/qwen2/modeling_qwen2.py .
+ patch -p0 < diff_model.patch
+ cp modeling_qwen2.py $install_python_path/lib/site-packages/transformers/models/qwen2
+ ```
+
+2. Install dependencies.
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+## Model Conversion and Inference
+
+1. Environment setup.
+
+   - Install protoc
+
+     Choose a suitable protoc version according to the Ascend documentation; this setup requires protoc 3.13.0 or later.
+     Download the matching release from https://github.com/protocolbuffers/protobuf/releases
+ ```
+     # Install protoc 3.13.0; download it to any spare directory
+ tar -zxvf protobuf-all-3.13.0.tar.gz
+ cd protobuf-3.13.0
+ apt-get update
+ apt-get install autoconf automake libtool
+ ./autogen.sh
+ ./configure
+ make -j4
+ make install
+ sudo ldconfig
+     protoc --version # check the installed version
+ ```
+
+   - Compile and deploy the custom operator
+ ```
+     # Copy ./custom_op/matmul_integer_plugin.cc to the required path
+     cd ascend-llm
+ export ASCEND_PATH=/usr/local/Ascend/ascend-toolkit/latest
+ cp custom_op/matmul_integer_plugin.cc $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx/framework/onnx_plugin/
+ cd $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx
+ ```
+     Open build.sh, locate the following four environment variables, uncomment them, and set them as follows:
+ ```
+ export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include
+ export TOOLCHAIN_DIR=/usr
+ export AICPU_KERNEL_TARGET=cust_aicpu_kernels
+ export AICPU_SOC_VERSION=Ascend310B4
+ ```
+   - Build and run
+ ```
+ ./build.sh
+ cd build_out/
+ ./custom_opp_ubuntu_aarch64.run
+     # The customize package is installed under the default directory $ASCEND_PATH/opp/vendors/; remove the redundant files
+ cd $ASCEND_PATH/opp/vendors/customize
+ rm -rf op_impl/ op_proto/
+ ```
+
+2. Model conversion (run in the export_llama directory).
+
+
+   1). Export the ONNX model.
+
+
+       python export_llama.py --model ${model_path} --output ${output_onnx_path}
+
+
+- Parameter description:
+   - model: path to the Hugging Face model weights
+   - output: path of the exported ONNX file
+
+
+   2). Convert the ONNX model to an OM model with the ATC tool.
+
+1. Set environment variables.
+
+
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+
+2. Run the following command to query the chip name ($\{chip\_name\}).
+
+
+ npu-smi info
+
+   In this example the chip name is Ascend310P3; replace it with the actual chip name on your device. The output looks like the following:
+
+   ```
+ +-------------------+-----------------+------------------------------------------------------+
+ | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page) |
+ | Chip Device | Bus-Id | AICore(%) Memory-Usage(MB) |
+ +===================+=================+======================================================+
+ | 0 310P3 | OK | 15.8 42 0 / 0 |
+ | 0 0 | 0000:82:00.0 | 0 1074 / 21534 |
+ +===================+=================+======================================================+
+ | 1 310P3 | OK | 15.4 43 0 / 0 |
+ | 0 1 | 0000:89:00.0 | 0 1070 / 21534 |
+ +===================+=================+======================================================+
+ ```
+
+3. Run the ATC command.
+
+    atc --framework=5 --model=${onnx_model_path} --output=${output_name} --input_format=ND --input_shape="input_ids:1,1;attention_mask:1,1025;position_ids:1,1;past_key_values:28,2,1,4,1024,128" --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype
+
+
+- Parameter description:
+
+   - model: the ONNX model file.
+   - framework: 5 indicates an ONNX model.
+   - output: name of the output OM model.
+   - input_format: format of the input data.
+   - input_shape: shape of the input data.
+   - log: log level.
+   - soc_version: processor model.
+
+
+   After the command succeeds, a model file with the .om suffix is generated.
+
+3. Run inference verification.
+   1). Preparation before inference:
+    A) Install the matching CANN toolkit and drivers on the edge device (for example, a 310B1).
+    B) Enter the inference directory and install the dependencies: pip install -r requirements.txt
+
+   2). Run inference:
+
+      python main.py --model ${om_model_path} --hf-dir ${hf_model_dir} --engine acl --sampling greedy --cli
+
+- Parameter description:
+   - model: path to the OM model
+   - hf-dir: directory containing the tokenizer and model configuration files; the weights are not required
+   - engine: only acl is supported on the 310B
+   - sampling: greedy/top_p/top_k
+   - cli: run in the terminal
+   Note: adjust the parameters above according to your actual setup.
+
+4. Dataset accuracy verification:
+   First download the CEval, BoolQ, and GSM8K datasets into the inference directory (the expected path is ./inference/dataset), and place test.py in the inference directory.
+
+   python test.py --model ${om_model_path} --hf-dir ${hf_model_dir} --engine acl --sampling greedy --cli --dataset=BoolQ/CEval/GSM8K
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..16613f21193dbbb8e5bdb825a17d6efff8b12d38
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt
@@ -0,0 +1,5 @@
+torch==2.1.0
+torch_npu
+onnx
+transformers==4.38.2
+lm-eval==0.4.2 # for eval
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b285cecce8d4a6fb52c7012d86eba3d0dbfec89d
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py
@@ -0,0 +1,91 @@
+# BSD 3-Clause License
+
+# Copyright (c) 2017,
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# * Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
+from config import InferenceConfig
+from inference import LlamaInterface
+
+def main(cli: bool, engine: LlamaInterface, dataset):
+ if cli:
+ if dataset == 'BoolQ':
+ engine.test_boolq()
+ elif dataset == 'CEval':
+ engine.test_ceval()
+ elif dataset == 'GSM8K':
+ engine.test_gsm8k()
+ else:
+            print("dataset is not supported!")
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--cli', dest='cli', default=False, action='store_true',
+        help="run the web UI by default; if --cli is given, run in the terminal."
+ )
+ parser.add_argument("--kv_size", type=int, default=1024)
+ parser.add_argument(
+ "--engine", type=str, default="acl",
+ help="inference backend, onnx or acl"
+ )
+ parser.add_argument(
+ "--sampling", type=str, default="top_k",
+ help="sampling method, greedy, top_k or top_p"
+ )
+ parser.add_argument(
+ "--sampling_value", type=float,default=10,
+        help="ignored when the sampling method is greedy; for top_k it is the value of k, for top_p the value of p"
+ )
+ parser.add_argument(
+ "--temperature", type=float,default=0.7,
+        help="sampling temperature; ignored when the sampling method is greedy."
+ )
+ parser.add_argument(
+ "--hf-dir", type=str, default="/root/model/tiny-llama-1.1B",
+ help="path to huggingface model dir"
+ )
+ parser.add_argument(
+ "--model", type=str, default="/root/model/tiny-llama-seq-1-key-256-int8.om",
+ help="path to onnx or om model"
+ )
+ parser.add_argument(
+ "--dataset", type=str, default="BoolQ"
+ )
+
+ args = parser.parse_args()
+ cfg = InferenceConfig(
+ hf_model_dir=args.hf_dir,
+ model=args.model,
+ max_cache_size=args.kv_size,
+ sampling_method=args.sampling,
+ sampling_value=args.sampling_value,
+ temperature=args.temperature,
+ session_type=args.engine,
+ )
+ engine = LlamaInterface(cfg)
+ main(args.cli,engine,args.dataset)
\ No newline at end of file