diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE
new file mode 100644
index 0000000000000000000000000000000000000000..2d7f4c35f4f18b7385598e9a5e4c2d3e3cc272d5
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE
@@ -0,0 +1,30 @@
+BSD 3-Clause License
+
+
+Copyright (c) 2017,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e6307976660a136161c1e763f2f810eb9651d7ad
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch
@@ -0,0 +1,42 @@
+diff -uNr ascend-llm/export_llama/export_llama.py ascend-llm-qwen/export_llama/export_llama.py
+--- ascend-llm/export_llama/export_llama.py 2024-09-05 15:10:55.831311000 +0800
++++ ascend-llm-qwen/export_llama/export_llama.py 2024-09-05 15:20:09.720307600 +0800
+@@ -2,16 +2,17 @@
+ import importlib
+ import torch
+ import os
+-from transformers import LlamaForCausalLM, LlamaTokenizer
+-
++from transformers import Qwen2ForCausalLM, Qwen2Tokenizer
++import torch_npu
++from torch_npu.contrib import transfer_to_npu
++torch_npu.npu.set_device("npu:3")
+
+ def export_onnx(base_model,out_path,quant_cfg_path,act_path):
+- tokenizer= LlamaTokenizer.from_pretrained(base_model)
+- model = LlamaForCausalLM.from_pretrained(
++ tokenizer= Qwen2Tokenizer.from_pretrained(base_model)
++ model = Qwen2ForCausalLM.from_pretrained(
+ base_model,
+ torch_dtype=torch.float16,
+- device_map="auto",
+- )
++ ).npu()
+ model_cfg=model.model.config
+ spec = importlib.util.spec_from_file_location("quant_cfg_module", quant_cfg_path)
+ quant_cfg_module = importlib.util.module_from_spec(spec)
+diff -uNr ascend-llm/inference/config.py ascend-llm-qwen/inference/config.py
+--- ascend-llm/inference/config.py 2024-09-05 15:10:55.833305200 +0800
++++ ascend-llm-qwen/inference/config.py 2024-09-05 15:20:36.210316200 +0800
+@@ -28,9 +28,9 @@
+ head_len:int= 32 # 在KVCache evict时前head len会被保留
+ recent_len:int = 32 # 在KVCache evict时最近recent len会被保留
+ evict_len:int = 64 # KVCache 逐出的最小值,当KVCache达到最大值时将逐出evict_len个KVCache
+- n_layer:int = 22
++ n_layer:int = 28
+ format:str='huggingface-tensor' #KVcache的格式
+- max_cache_size:int=256 # kvcache的最大长度
++ max_cache_size:int=1024 # kvcache的最大长度
+ head_num:int=4
+ num_kv_group:int = 8 # for GQA
+ head_dim:int=64
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch
new file mode 100644
index 0000000000000000000000000000000000000000..84e6d03e70ab2c18f31f8fdd827b500842d336ab
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch
@@ -0,0 +1,70 @@
+--- modeling_qwen2.py 2024-09-04 22:30:47.490111800 +0800
++++ modeling_qwen2_export.py 2024-09-04 22:49:20.540908500 +0800
+@@ -162,6 +162,10 @@
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
++ return(
++ self.cos_cached.to(dtype=x.dtype),
++ self.sin_cached.to(dtype=x.dtype),
++ )
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+@@ -312,6 +316,7 @@
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
++ out_key_value = (key_states, value_states) if use_cache else None
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+@@ -351,7 +356,7 @@
+ if not output_attentions:
+ attn_weights = None
+
+- return attn_output, attn_weights, past_key_value
++ return attn_output, attn_weights, out_key_value
+
+
+ class Qwen2FlashAttention2(Qwen2Attention):
+@@ -895,7 +900,7 @@
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+- next_decoder_cache = None
++ next_decoder_cache = [] if use_cache else None
+
+ for decoder_layer in self.layers:
+ if output_hidden_states:
+@@ -926,7 +931,11 @@
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
++ key_values = layer_outputs[2 if output_attentions else 1]
++ if isinstance(next_decoder_cache,tuple):
++ next_decoder_cache=list(next_decoder_cache)
++ assert isinstance(next_decoder_cache, list),"transform failed"
++                next_decoder_cache.extend(key_values)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+@@ -937,9 +946,7 @@
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+- next_cache = None
+- if use_cache:
+- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
++        next_cache = torch.stack(next_decoder_cache).reshape(len(self.layers),2,*next_decoder_cache[0].shape) if use_cache else None
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+@@ -1433,4 +1440,4 @@
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+- )
+\ No newline at end of file
++ )
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py
new file mode 100644
index 0000000000000000000000000000000000000000..0147f8fdf5c24c2765495bedf1ad9b3edf0307d9
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py
@@ -0,0 +1,58 @@
+# BSD 3-Clause License
+
+# Copyright (c) 2017,
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# * Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import onnx
+import onnx.helper as helper
+
+# Rewrite every Trilu node so that it keeps only its first input and carries a
+# fixed upper=1 attribute; all other nodes are copied over unchanged.
+model = onnx.load("qwen2.onnx")
+new_nodes = []
+
+for node in model.graph.node:
+    if node.op_type == "Trilu":
+        new_node = helper.make_node(
+            "Trilu",
+            inputs=[node.input[0]],
+            outputs=node.output,
+            upper=1
+        )
+        new_nodes.append(new_node)
+    else:
+        new_nodes.append(node)
+
+new_graph = helper.make_graph(
+ new_nodes,
+ "new_graph",
+ inputs=model.graph.input,
+ outputs=model.graph.output,
+ value_info=model.graph.value_info,
+ initializer=model.graph.initializer
+)
+
+new_model = helper.make_model(new_graph, producer_name=model.producer_name, opset_imports=model.opset_import, ir_version=model.ir_version)
+onnx.save(new_model, "qwen2.onnx", save_as_external_data=True)
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..6964a3f5b9ebcdd7b6e4516360f2244f1bbca8b0
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md
@@ -0,0 +1,201 @@
+# README
+
+This README describes the offline model export and inference scripts for Qwen2-7B on the Ascend 310B environment and explains how to use them.
+
+- [Overview](#ZH-CN_TOPIC_0000001172161501)
+
+  - [Input and Output Data](#section540883920406)
+
+- [Inference Environment Setup](#ZH-CN_TOPIC_0000001126281702)
+
+- [Quick Start](#ZH-CN_TOPIC_0000001126281700)
+
+  - [Obtaining the Source Code](#section4622531142816)
+  - [Model Conversion and Inference](#section741711594517)
+
+
+# Overview
+
+Tongyi Qianwen (Qwen) is a large-scale language model developed by Alibaba Cloud. It can answer questions, compose text, express opinions, and write code.
+
+## Input and Output Data
+
+- Input data
+
+  | Input           | Data Type | Shape                                   | Format |
+  | --------------- | --------- | --------------------------------------- | ------ |
+  | input_ids       | INT64     | 1 x 1                                   | ND     |
+  | attention_mask  | INT64     | 1 x 1                                   | ND     |
+  | position_ids    | INT64     | 1 x 1                                   | ND     |
+  | past_key_values | FLOAT16   | layers, 2, 1, n_heads, kv_len, head_dim | ND     |
+
+- Output data (a minimal I/O smoke-test sketch follows the tables)
+
+  | Output         | Data Type | Shape                                   | Format |
+  | -------------- | --------- | --------------------------------------- | ------ |
+  | logits         | FLOAT32   | 1 x vocab_size                          | ND     |
+  | out_key_values | FLOAT16   | layers, 2, 1, n_heads, kv_len, head_dim | ND     |
+
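+  The following is a minimal, illustrative I/O smoke test, not part of the delivered scripts: it feeds dummy tensors with the single-token decode shapes used by the ATC command below into the exported ONNX model. It assumes the export kept the input/output names and dtypes listed above, and it requires onnxruntime, which is not listed in requirements.txt.
+
+  ```
+  # Hypothetical shape check for the exported qwen2.onnx (Qwen2-7B: 28 layers,
+  # 4 KV heads, kv_len 1024, head_dim 128, matching the ATC --input_shape below).
+  import numpy as np
+  import onnxruntime as ort
+
+  sess = ort.InferenceSession("qwen2.onnx")
+  print([(i.name, i.shape) for i in sess.get_inputs()])  # inspect the expected inputs
+
+  feed = {
+      "input_ids":       np.zeros((1, 1), dtype=np.int64),
+      "attention_mask":  np.ones((1, 1025), dtype=np.int64),
+      "position_ids":    np.zeros((1, 1), dtype=np.int64),
+      "past_key_values": np.zeros((28, 2, 1, 4, 1024, 128), dtype=np.float16),
+  }
+  logits, out_key_values = sess.run(None, feed)
+  print(logits.shape, out_key_values.shape)  # expect (1, vocab_size) and the KV layout above
+  ```
+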
+# Inference Environment Setup
+
+  **Table 1** Version compatibility
+
+  | Component            | Version             | Package / Environment Setup Guide                                                                                 |
+  | -------------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- |
+  | Firmware and drivers | Ascend HDK 24.1.RC3 | [PyTorch framework inference environment setup](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) |
+  | CANN                 | CANN 8.0.RC3        | https://cmc-szv.clouddragon.huawei.com/cmcversion/index/releaseView?deltaId=10860207193326848&isSelect=Software   |
+  | Python               | 3.9.19              | -                                                                                                                   |
+  | PyTorch              | 2.1.0               | -                                                                                                                   |
+
+  Note: for 310B inference cards, select the actual firmware and driver version according to the CANN version.
+
+# Quick Start
+
+## Obtaining the Source Code
+
+1. Get the source code.
+
+ ```
+   # Source commit: 1392d7f. This repository is no longer updated; the latest version can be fetched as follows.
+ git clone https://gitee.com/yinghuo302/ascend-llm
+ cd ascend-llm
+ patch -p0 < diff.patch
+ cp $install_python_path/lib/site-packages/transformers/models/qwen2/modeling_qwen2.py .
+ patch -p0 < diff_model.patch
+ cp modeling_qwen2.py $install_python_path/lib/site-packages/transformers/models/qwen2
+ ```
+
+2. Install dependencies.
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+## Model Conversion and Inference
+
+1. Environment setup.
+
+   - Install protoc
+
+     Choose a suitable protoc version according to the Ascend documentation; this setup requires protoc 3.13.0 or later.
+     Download the matching release from https://github.com/protocolbuffers/protobuf/releases
+ ```
+     # Install protoc 3.13.0; download it to any spare directory
+ tar -zxvf protobuf-all-3.13.0.tar.gz
+ cd protobuf-3.13.0
+ apt-get update
+ apt-get install autoconf automake libtool
+ ./autogen.sh
+ ./configure
+ make -j4
+ make install
+ sudo ldconfig
+     protoc --version # check the installed version
+ ```
+
+   - Compile and deploy the custom operator
+ ```
+     # Copy ./custom_op/matmul_integer_plugin.cc to the required path
+     cd ascend-llm
+ export ASCEND_PATH=/usr/local/Ascend/ascend-toolkit/latest
+ cp custom_op/matmul_integer_plugin.cc $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx/framework/onnx_plugin/
+ cd $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx
+ ```
+     Open build.sh, locate the following four environment variables, uncomment them, and set them as follows:
+ ```
+ export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include
+ export TOOLCHAIN_DIR=/usr
+ export AICPU_KERNEL_TARGET=cust_aicpu_kernels
+ export AICPU_SOC_VERSION=Ascend310B4
+ ```
+   - Build and run
+ ```
+ ./build.sh
+ cd build_out/
+ ./custom_opp_ubuntu_aarch64.run
+     # The customize package is installed under the default directory $ASCEND_PATH/opp/vendors/; remove the redundant files
+ cd $ASCEND_PATH/opp/vendors/customize
+ rm -rf op_impl/ op_proto/
+ ```
+
+2. Model conversion (run in the export_llama directory).
+
+
+   1). Export the ONNX model.
+
+
+       python export_llama.py --model ${model_path} --output ${output_onnx_path}
+
+
+- Parameter description:
+   - model: path to the Hugging Face model weights
+   - output: path of the exported ONNX file
+
+
+   2). Convert the ONNX model to an OM model with the ATC tool.
+
+1. Set environment variables.
+
+
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+
+2. Run the following command to query the chip name ($\{chip\_name\}).
+
+
+ npu-smi info
+
+   In this example the chip name is Ascend310P3; replace it with the actual chip name on your device. The output looks like the following:
+
+   ```
+ +-------------------+-----------------+------------------------------------------------------+
+ | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page) |
+ | Chip Device | Bus-Id | AICore(%) Memory-Usage(MB) |
+ +===================+=================+======================================================+
+ | 0 310P3 | OK | 15.8 42 0 / 0 |
+ | 0 0 | 0000:82:00.0 | 0 1074 / 21534 |
+ +===================+=================+======================================================+
+ | 1 310P3 | OK | 15.4 43 0 / 0 |
+ | 0 1 | 0000:89:00.0 | 0 1070 / 21534 |
+ +===================+=================+======================================================+
+ ```
+
+3. Run the ATC command.
+
+    atc --framework=5 --model=${onnx_model_path} --output=${output_name} --input_format=ND --input_shape="input_ids:1,1;attention_mask:1,1025;position_ids:1,1;past_key_values:28,2,1,4,1024,128" --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype
+
+
+- Parameter description:
+
+   - model: the ONNX model file.
+   - framework: 5 indicates an ONNX model.
+   - output: name of the output OM model.
+   - input_format: format of the input data.
+   - input_shape: shape of the input data.
+   - log: log level.
+   - soc_version: processor model.
+
+
+   After the command succeeds, a model file with the .om suffix is generated.
+
+3. Run inference verification.
+   1). Preparation before inference:
+    A) Install the matching CANN toolkit and drivers on the edge device (for example, a 310B1).
+    B) Enter the inference directory and install the dependencies: pip install -r requirements.txt
+
+   2). Run inference:
+
+      python main.py --model ${om_model_path} --hf-dir ${hf_model_dir} --engine acl --sampling greedy --cli
+
+- Parameter description:
+   - model: path to the OM model
+   - hf-dir: directory containing the tokenizer and model configuration files; the weights are not required
+   - engine: only acl is supported on the 310B
+   - sampling: greedy/top_p/top_k
+   - cli: run in the terminal
+   Note: adjust the parameters above according to your actual setup.
+
+4. Dataset accuracy verification:
+   First download the CEval, BoolQ, and GSM8K datasets into the inference directory (the expected path is ./inference/dataset), and place test.py in the inference directory.
+
+   python test.py --model ${om_model_path} --hf-dir ${hf_model_dir} --engine acl --sampling greedy --cli --dataset=BoolQ/CEval/GSM8K
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..16613f21193dbbb8e5bdb825a17d6efff8b12d38
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt
@@ -0,0 +1,5 @@
+torch==2.1.0
+torch_npu
+onnx
+transformers==4.38.2
+lm-eval==0.4.2 # for eval
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b285cecce8d4a6fb52c7012d86eba3d0dbfec89d
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py
@@ -0,0 +1,91 @@
+# BSD 3-Clause License
+
+# Copyright (c) 2017,
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# * Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
+from config import InferenceConfig
+from inference import LlamaInterface
+
+def main(cli: bool, engine: LlamaInterface, dataset):
+ if cli:
+ if dataset == 'BoolQ':
+ engine.test_boolq()
+ elif dataset == 'CEval':
+ engine.test_ceval()
+ elif dataset == 'GSM8K':
+ engine.test_gsm8k()
+ else:
+            print("dataset is not supported!")
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--cli', dest='cli', default=False, action='store_true',
+        help="run the web UI by default; if --cli is given, run in the terminal."
+ )
+ parser.add_argument("--kv_size", type=int, default=1024)
+ parser.add_argument(
+ "--engine", type=str, default="acl",
+ help="inference backend, onnx or acl"
+ )
+ parser.add_argument(
+ "--sampling", type=str, default="top_k",
+ help="sampling method, greedy, top_k or top_p"
+ )
+ parser.add_argument(
+ "--sampling_value", type=float,default=10,
+        help="ignored when the sampling method is greedy; for top_k it is the value of k, for top_p the value of p"
+ )
+ parser.add_argument(
+ "--temperature", type=float,default=0.7,
+        help="sampling temperature; ignored when the sampling method is greedy."
+ )
+ parser.add_argument(
+ "--hf-dir", type=str, default="/root/model/tiny-llama-1.1B",
+ help="path to huggingface model dir"
+ )
+ parser.add_argument(
+ "--model", type=str, default="/root/model/tiny-llama-seq-1-key-256-int8.om",
+ help="path to onnx or om model"
+ )
+ parser.add_argument(
+ "--dataset", type=str, default="BoolQ"
+ )
+
+ args = parser.parse_args()
+ cfg = InferenceConfig(
+ hf_model_dir=args.hf_dir,
+ model=args.model,
+ max_cache_size=args.kv_size,
+ sampling_method=args.sampling,
+ sampling_value=args.sampling_value,
+ temperature=args.temperature,
+ session_type=args.engine,
+ )
+ engine = LlamaInterface(cfg)
+ main(args.cli,engine,args.dataset)
\ No newline at end of file