diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE new file mode 100644 index 0000000000000000000000000000000000000000..2d7f4c35f4f18b7385598e9a5e4c2d3e3cc272d5 --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE @@ -0,0 +1,30 @@ +BSD 3-Clause License + + +Copyright (c) 2017, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch new file mode 100644 index 0000000000000000000000000000000000000000..e6307976660a136161c1e763f2f810eb9651d7ad --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch @@ -0,0 +1,42 @@ +diff -uNr ascend-llm/export_llama/export_llama.py ascend-llm-qwen/export_llama/export_llama.py +--- ascend-llm/export_llama/export_llama.py 2024-09-05 15:10:55.831311000 +0800 ++++ ascend-llm-qwen/export_llama/export_llama.py 2024-09-05 15:20:09.720307600 +0800 +@@ -2,16 +2,17 @@ + import importlib + import torch + import os +-from transformers import LlamaForCausalLM, LlamaTokenizer +- ++from transformers import Qwen2ForCausalLM, Qwen2Tokenizer ++import torch_npu ++from torch_npu.contrib import transfer_to_npu ++torch_npu.npu.set_device("npu:3") + + def export_onnx(base_model,out_path,quant_cfg_path,act_path): +- tokenizer= LlamaTokenizer.from_pretrained(base_model) +- model = LlamaForCausalLM.from_pretrained( ++ tokenizer= Qwen2Tokenizer.from_pretrained(base_model) ++ model = Qwen2ForCausalLM.from_pretrained( + base_model, + torch_dtype=torch.float16, +- device_map="auto", +- ) ++ ).npu() + model_cfg=model.model.config + spec = importlib.util.spec_from_file_location("quant_cfg_module", quant_cfg_path) + quant_cfg_module = importlib.util.module_from_spec(spec) +diff -uNr ascend-llm/inference/config.py ascend-llm-qwen/inference/config.py +--- ascend-llm/inference/config.py 2024-09-05 15:10:55.833305200 +0800 ++++ ascend-llm-qwen/inference/config.py 2024-09-05 15:20:36.210316200 +0800 +@@ -28,9 +28,9 @@ + head_len:int= 32 # 在KVCache evict时前head len会被保留 + recent_len:int = 32 # 在KVCache evict时最近recent len会被保留 + evict_len:int = 64 # KVCache 逐出的最小值,当KVCache达到最大值时将逐出evict_len个KVCache +- n_layer:int = 22 ++ n_layer:int = 28 + format:str='huggingface-tensor' #KVcache的格式 +- max_cache_size:int=256 # kvcache的最大长度 ++ max_cache_size:int=1024 # kvcache的最大长度 + head_num:int=4 + num_kv_group:int = 8 # for GQA + head_dim:int=64 \ No newline at end of file diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch new file mode 100644 index 0000000000000000000000000000000000000000..84e6d03e70ab2c18f31f8fdd827b500842d336ab --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch @@ -0,0 +1,70 @@ +--- modeling_qwen2.py 2024-09-04 22:30:47.490111800 +0800 ++++ modeling_qwen2_export.py 2024-09-04 22:49:20.540908500 +0800 +@@ -162,6 +162,10 @@ + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): ++ return( ++ self.cos_cached.to(dtype=x.dtype), ++ self.sin_cached.to(dtype=x.dtype), ++ ) + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) +@@ -312,6 +316,7 @@ + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + ++ out_key_value = (key_states, value_states) if use_cache else None + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) +@@ -351,7 +356,7 @@ + if not output_attentions: + attn_weights = None + +- return attn_output, 
attn_weights, past_key_value ++ return attn_output, attn_weights, out_key_value + + + class Qwen2FlashAttention2(Qwen2Attention): +@@ -895,7 +900,7 @@ + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None +- next_decoder_cache = None ++ next_decoder_cache = [] if use_cache else None + + for decoder_layer in self.layers: + if output_hidden_states: +@@ -926,7 +931,11 @@ + hidden_states = layer_outputs[0] + + if use_cache: +- next_decoder_cache = layer_outputs[2 if output_attentions else 1] ++ key_values = layer_outputs[2 if output_attentions else 1] ++ if isinstance(next_decoder_cache,tuple): ++ next_decoder_cache=list(next_decoder_cache) ++ assert isinstance(next_decoder_cache, list),"transform failed" ++ next_decoder_cache.extend(key_values) + + if output_attentions: + all_self_attns += (layer_outputs[1],) +@@ -937,9 +946,7 @@ + if output_hidden_states: + all_hidden_states += (hidden_states,) + +- next_cache = None +- if use_cache: +- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache ++ next_cache = torch.stack(next_decoder_cache).reshape(len(self.layers),2,*next_decoder_cache[0].shape) if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) +@@ -1433,4 +1440,4 @@ + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, +- ) +\ No newline at end of file ++ ) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py new file mode 100644 index 0000000000000000000000000000000000000000..0147f8fdf5c24c2765495bedf1ad9b3edf0307d9 --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License + +# Copyright (c) 2017, +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
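+# NOTE: this script post-processes the exported ONNX graph (expected as qwen2.onnx in the
+# working directory). It rewrites every Trilu node so that the dynamic `k` input is dropped
+# and `upper=1` is carried as a constant attribute instead, which is assumed to be needed
+# for the subsequent ATC conversion on Ascend 310B. The rewritten graph is saved back to
+# qwen2.onnx with its weights stored as external data.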
+import onnx
+import onnx.helper as helper
+
+model = onnx.load("qwen2.onnx")
+new_nodes = []
+
+for node in model.graph.node:
+    if node.op_type == "Trilu":
+        # Rebuild the node with only its first input and a constant upper=1 attribute.
+        new_node = helper.make_node(
+            "Trilu",
+            inputs=[node.input[0]],
+            outputs=node.output,
+            upper=1
+        )
+        new_nodes.append(new_node)
+    else:
+        # Keep all other nodes unchanged.
+        new_nodes.append(node)
+
+new_graph = helper.make_graph(
+    new_nodes,
+    "new_graph",
+    inputs=model.graph.input,
+    outputs=model.graph.output,
+    value_info=model.graph.value_info,
+    initializer=model.graph.initializer
+)
+
+new_model = helper.make_model(new_graph, producer_name=model.producer_name, opset_imports=model.opset_import, ir_version=model.ir_version)
+onnx.save(new_model, "qwen2.onnx", save_as_external_data=True)
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..6964a3f5b9ebcdd7b6e4516360f2244f1bbca8b0
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md
@@ -0,0 +1,201 @@
+# README
+
+This README describes how to export Qwen2-7B as an offline model for the Ascend 310B environment and how to run the accompanying inference scripts.
+
+- [Overview](#ZH-CN_TOPIC_0000001172161501)
+
+  - [Input and output data](#section540883920406)
+
+- [Inference environment](#ZH-CN_TOPIC_0000001126281702)
+
+- [Quick start](#ZH-CN_TOPIC_0000001126281700)
+
+  - [Getting the source code](#section4622531142816)
+  - [Model conversion and inference](#section741711594517)
+
+
+# Overview
+
+Tongyi Qianwen (Qwen) is a large-scale language model developed by Alibaba Cloud. It can answer questions, write text, express opinions, and generate code.
+
+## Input and output data
+
+- Input data
+
+  | Input           | Data type | Shape                                     | Format |
+  | --------------- | --------- | ----------------------------------------- | ------ |
+  | input_ids       | INT64     | 1 x 1                                      | ND     |
+  | attention_mask  | INT64     | 1 x (kv_len + 1)                           | ND     |
+  | position_ids    | INT64     | 1 x 1                                      | ND     |
+  | past_key_values | FLOAT16   | layers, 2, 1, n_heads, kv_len, head_dim    | ND     |
+
+- Output data
+
+  | Output         | Data type | Shape                                     | Format |
+  | -------------- | --------- | ------------------------------------------ | ------ |
+  | logits         | FLOAT32   | 1 x vocab_size                             | ND     |
+  | out_key_values | FLOAT16   | layers, 2, 1, n_heads, kv_len, head_dim    | ND     |
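+For reference, the tables above describe a single decode step. The sketch below is not part of the repository's scripts; it assumes the exported ONNX file from the conversion section is named qwen2.onnx and uses onnxruntime only as an example runtime (whether the exported graph runs under stock onnxruntime depends on the custom quantization operators, so treat this purely as a shape reference; on the device the ACL engine is used):
+
+```
+import numpy as np
+import onnxruntime as ort
+
+# Qwen2-7B as configured in this README: 28 layers, 4 KV heads, head_dim 128, KV cache length 1024
+kv_len = 1024
+feeds = {
+    "input_ids": np.array([[0]], dtype=np.int64),                # 1 x 1: one new token id
+    "attention_mask": np.ones((1, kv_len + 1), dtype=np.int64),  # cached positions plus the current token
+    "position_ids": np.array([[0]], dtype=np.int64),             # 1 x 1
+    "past_key_values": np.zeros((28, 2, 1, 4, kv_len, 128), dtype=np.float16),
+}
+session = ort.InferenceSession("qwen2.onnx")
+logits, out_key_values = session.run(None, feeds)                # logits: 1 x vocab_size
+```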
+# Inference environment
+
+  **Table 1** Version compatibility
+
+  | Component          | Version             | Package location / setup guide |
+  | ------------------ | ------------------- | ------------------------------ |
+  | Firmware & driver  | Ascend HDK 24.1.RC3 | [PyTorch inference environment setup](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) |
+  | CANN               | CANN 8.0.RC3        | https://cmc-szv.clouddragon.huawei.com/cmcversion/index/releaseView?deltaId=10860207193326848&isSelect=Software |
+  | Python             | 3.9.19              | - |
+  | PyTorch            | 2.1.0               | - |
+
+  Note: for 310B inference cards, select the firmware and driver versions that match the installed CANN version.
+
+# Quick start
+
+## Getting the source code
+
+1. Get the source code.
+
+   ```
+   # Source commit: 1392d7f. The upstream repository is no longer updated, so the latest revision can be fetched as below.
+   git clone https://gitee.com/yinghuo302/ascend-llm
+   cd ascend-llm
+   patch -p0 < diff.patch
+   cp $install_python_path/lib/site-packages/transformers/models/qwen2/modeling_qwen2.py .
+   patch -p0 < diff_model.patch
+   cp modeling_qwen2.py $install_python_path/lib/site-packages/transformers/models/qwen2
+   ```
+
+2. Install the dependencies.
+
+   ```
+   pip install -r requirements.txt
+   ```
+
+## Model conversion and inference
+
+1. Set up the environment.
+
+   - Install protoc
+
+     Choose a protoc release according to the Ascend documentation; this setup uses protoc 3.13.0.
+     Download the matching archive from https://github.com/protocolbuffers/protobuf/releases.
+     ```
+     # Install protoc 3.13.0 in any spare working directory
+     tar -zxvf protobuf-all-3.13.0.tar.gz
+     cd protobuf-3.13.0
+     apt-get update
+     apt-get install autoconf automake libtool
+     ./autogen.sh
+     ./configure
+     make -j4
+     make install
+     sudo ldconfig
+     protoc --version # check the installed version
+     ```
+
+   - Build and deploy the custom operator
+     ```
+     # Copy ./custom_op/matmul_integer_plugin.cc to the ONNX plugin directory
+     cd ascend-llm
+     export ASCEND_PATH=/usr/local/Ascend/ascend-toolkit/latest
+     cp custom_op/matmul_integer_plugin.cc $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx/framework/onnx_plugin/
+     cd $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx
+     ```
+     Open build.sh, find the following four environment variables, uncomment them and set them as follows:
+     ```
+     export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include
+     export TOOLCHAIN_DIR=/usr
+     export AICPU_KERNEL_TARGET=cust_aicpu_kernels
+     export AICPU_SOC_VERSION=Ascend310B4
+     ```
+   - Compile and deploy
+     ```
+     ./build.sh
+     cd build_out/
+     ./custom_opp_ubuntu_aarch64.run
+     # The package is installed to the default directory $ASCEND_PATH/opp/vendors/customize; remove the redundant files
+     cd $ASCEND_PATH/opp/vendors/customize
+     rm -rf op_impl/ op_proto/
+     ```
+
+2. Convert the model (run inside the export_llama directory).
+
+   1) Export the ONNX model.
+
+      ```
+      python export_llama.py --model ${model path} --output ${output onnx path}
+      ```
+
+      - Parameters:
+        - model: path to the Hugging Face model directory.
+        - output: path of the exported ONNX file.
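+      Before converting, it can help to confirm that the exported graph's input and output names and shapes match the `--input_shape` string used in the ATC step below. A minimal check (assuming the exported file is named qwen2.onnx; adjust the path to your output):
+
+      ```
+      import onnx
+
+      # Only the graph structure is needed for this check, so skip loading the external weight files.
+      model = onnx.load("qwen2.onnx", load_external_data=False)
+      for tensor in list(model.graph.input) + list(model.graph.output):
+          dims = [d.dim_param or d.dim_value for d in tensor.type.tensor_type.shape.dim]
+          print(tensor.name, dims)
+      ```
+
+      Note: this directory also contains export_Trilu.py, which rewrites the Trilu nodes of the exported ONNX file (removing their dynamic `k` input and fixing `upper=1`); it is presumably intended to be run on the ONNX file before the ATC conversion.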
+   2) Convert the ONNX model to an OM model with the ATC tool.
+
+      1. Configure the environment variables.
+
+         ```
+         source /usr/local/Ascend/ascend-toolkit/set_env.sh
+         ```
+
+      2. Query the chip name (${chip_name}).
+
+         ```
+         npu-smi info
+         ```
+         On this example machine the chip name is Ascend310P3 (replace it with your own); the output looks like:
+         ```
+         +-------------------+-----------------+------------------------------------------------------+
+         | NPU   Name        | Health          | Power(W)    Temp(C)           Hugepages-Usage(page)  |
+         | Chip  Device      | Bus-Id          | AICore(%)   Memory-Usage(MB)                         |
+         +===================+=================+======================================================+
+         | 0     310P3       | OK              | 15.8        42                0    / 0               |
+         | 0     0           | 0000:82:00.0    | 0           1074 / 21534                             |
+         +===================+=================+======================================================+
+         | 1     310P3       | OK              | 15.4        43                0    / 0               |
+         | 0     1           | 0000:89:00.0    | 0           1070 / 21534                             |
+         +===================+=================+======================================================+
+         ```
+
+      3. Run the ATC command.
+
+         ```
+         atc --framework=5 --model=${onnx path} --output=${output name} --input_format=ND --input_shape="input_ids:1,1;attention_mask:1,1025;position_ids:1,1;past_key_values:28,2,1,4,1024,128" --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype
+         ```
+
+         - Parameters:
+           - model: the ONNX model file.
+           - framework: 5 stands for an ONNX model.
+           - output: name of the generated OM model.
+           - input_format: format of the input data.
+           - input_shape: shapes of the input data.
+           - log: log level.
+           - soc_version: processor model.
+
+         After the command succeeds, a model file with the .om suffix is generated.
+
+3. Run inference.
+
+   1) Preparation:
+      - Install the matching CANN toolkit and driver on the target device (e.g. a 310B1).
+      - Enter the inference directory and install its dependencies: pip install -r requirements.txt
+
+   2) Run inference:
+
+      ```
+      python main.py --model ${om path} --hf-dir ${model path} --engine acl --sampling greedy --cli
+      ```
+
+      - Parameters:
+        - model: path to the OM model.
+        - hf-dir: directory containing the tokenizer and model configuration files; the weights are not required.
+        - engine: only acl is supported on 310B.
+        - sampling: greedy / top_p / top_k.
+        - cli: run in the terminal.
+
+      Adjust the arguments above to your actual setup.
+
+   3) Dataset accuracy validation:
+
+      First download the CEval, BoolQ and GSM8K datasets into the inference directory (expected under ./inference/dataset), and copy test.py into the inference directory.
+
+      ```
+      python test.py --model ${om path} --hf-dir ${model path} --engine acl --sampling greedy --cli --dataset=BoolQ/CEval/GSM8K
+      ```
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..16613f21193dbbb8e5bdb825a17d6efff8b12d38
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt
@@ -0,0 +1,5 @@
+torch==2.1.0
+torch_npu
+onnx
+transformers==4.38.2
+lm-eval==0.4.2 # for eval
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b285cecce8d4a6fb52c7012d86eba3d0dbfec89d
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py
@@ -0,0 +1,91 @@
+# BSD 3-Clause License
+
+# Copyright (c) 2017,
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
+from config import InferenceConfig
+from inference import LlamaInterface
+
+def main(cli: bool, engine: LlamaInterface, dataset):
+    if cli:
+        if dataset == 'BoolQ':
+            engine.test_boolq()
+        elif dataset == 'CEval':
+            engine.test_ceval()
+        elif dataset == 'GSM8K':
+            engine.test_gsm8k()
+        else:
+            print("dataset is not supported!")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--cli', dest='cli', default=False, action='store_true',
+        help="run the web UI by default; if --cli is given, run in the terminal."
+    )
+    parser.add_argument("--kv_size", type=int, default=1024)
+    parser.add_argument(
+        "--engine", type=str, default="acl",
+        help="inference backend, onnx or acl"
+    )
+    parser.add_argument(
+        "--sampling", type=str, default="top_k",
+        help="sampling method, greedy, top_k or top_p"
+    )
+    parser.add_argument(
+        "--sampling_value", type=float, default=10,
+        help="ignored if the sampling method is greedy; the value of k for top_k, or the value of p for top_p"
+    )
+    parser.add_argument(
+        "--temperature", type=float, default=0.7,
+        help="sampling temperature; ignored if the sampling method is greedy"
+    )
+    parser.add_argument(
+        "--hf-dir", type=str, default="/root/model/tiny-llama-1.1B",
+        help="path to the huggingface model dir"
+    )
+    parser.add_argument(
+        "--model", type=str, default="/root/model/tiny-llama-seq-1-key-256-int8.om",
+        help="path to the onnx or om model"
+    )
+    parser.add_argument(
+        "--dataset", type=str, default="BoolQ"
+    )
+
+    args = parser.parse_args()
+    cfg = InferenceConfig(
+        hf_model_dir=args.hf_dir,
+        model=args.model,
+        max_cache_size=args.kv_size,
+        sampling_method=args.sampling,
+        sampling_value=args.sampling_value,
+        temperature=args.temperature,
+        session_type=args.engine,
+    )
+    engine = LlamaInterface(cfg)
+    main(args.cli, engine, args.dataset)
\ No newline at end of file