From 8bc28b62b8c375bb9be8a040111b899fd3922f2b Mon Sep 17 00:00:00 2001
From: linsicong <12378173+linsicong97_admin@user.noreply.gitee.com>
Date: Wed, 4 Sep 2024 16:16:30 +0000
Subject: [PATCH 1/9] add qwen

Signed-off-by: linsicong <12378173+linsicong97_admin@user.noreply.gitee.com>
---
 .../built-in/nlp/Qwen_for_Pytorch/LICENCE     |   29 +
 .../built-in/nlp/Qwen_for_Pytorch/diff.patch  | 4458 +++++++++++++++++
 .../nlp/Qwen_for_Pytorch/diff_model.patch     |   70 +
 .../nlp/Qwen_for_Pytorch/export_Trilu.py      |   29 +
 .../built-in/nlp/Qwen_for_Pytorch/readme.md   |   38 +
 .../nlp/Qwen_for_Pytorch/requirements.txt     |    5 +
 .../built-in/nlp/Qwen_for_Pytorch/test.py     |   62 +
 7 files changed, 4691 insertions(+)
 create mode 100644 ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE
 create mode 100644 ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch
 create mode 100644 ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch
 create mode 100644 ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py
 create mode 100644 ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md
 create mode 100644 ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt
 create mode 100644 ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py

diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE
new file mode 100644
index 0000000000..db05a35866
--- /dev/null
+++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2017,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch new file mode 100644 index 0000000000..7e1fe724a0 --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch @@ -0,0 +1,4458 @@ +diff -uNr ascend-llm/.git/HEAD ascend-llm-qwen/.git/HEAD +--- ascend-llm/.git/HEAD 2024-09-04 19:28:06.528235700 +0800 ++++ ascend-llm-qwen/.git/HEAD 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-ref: refs/heads/lscno2 +diff -uNr ascend-llm/.git/config ascend-llm-qwen/.git/config +--- ascend-llm/.git/config 2024-09-04 19:29:48.493570100 +0800 ++++ ascend-llm-qwen/.git/config 1970-01-01 08:00:00.000000000 +0800 +@@ -1,15 +0,0 @@ +-[core] +- repositoryformatversion = 0 +- filemode = false +- bare = false +- logallrefupdates = true +- symlinks = false +- ignorecase = true +-[remote "origin"] +- url = https://gitee.com/yinghuo302/ascend-llm.git +- fetch = +refs/heads/*:refs/remotes/origin/* +-[branch "main"] +- remote = origin +- merge = refs/heads/main +-[branch "lscno2"] +- vscode-merge-base = origin/main +diff -uNr ascend-llm/.git/description ascend-llm-qwen/.git/description +--- ascend-llm/.git/description 2024-09-04 19:20:58.889995300 +0800 ++++ ascend-llm-qwen/.git/description 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-Unnamed repository; edit this file 'description' to name the repository. +diff -uNr ascend-llm/.git/hooks/applypatch-msg.sample ascend-llm-qwen/.git/hooks/applypatch-msg.sample +--- ascend-llm/.git/hooks/applypatch-msg.sample 2024-09-04 19:20:58.889995300 +0800 ++++ ascend-llm-qwen/.git/hooks/applypatch-msg.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,15 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to check the commit log message taken by +-# applypatch from an e-mail message. +-# +-# The hook should exit with non-zero status after issuing an +-# appropriate message if it wants to stop the commit. The hook is +-# allowed to edit the commit message file. +-# +-# To enable this hook, rename this file to "applypatch-msg". +- +-. git-sh-setup +-commitmsg="$(git rev-parse --git-path hooks/commit-msg)" +-test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"} +-: +diff -uNr ascend-llm/.git/hooks/commit-msg.sample ascend-llm-qwen/.git/hooks/commit-msg.sample +--- ascend-llm/.git/hooks/commit-msg.sample 2024-09-04 19:20:58.889995300 +0800 ++++ ascend-llm-qwen/.git/hooks/commit-msg.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,24 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to check the commit log message. +-# Called by "git commit" with one argument, the name of the file +-# that has the commit message. The hook should exit with non-zero +-# status after issuing an appropriate message if it wants to stop the +-# commit. The hook is allowed to edit the commit message file. +-# +-# To enable this hook, rename this file to "commit-msg". +- +-# Uncomment the below to add a Signed-off-by line to the message. +-# Doing this in a hook is a bad idea in general, but the prepare-commit-msg +-# hook is more suited to it. +-# +-# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +-# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" +- +-# This example catches duplicate Signed-off-by lines. +- +-test "" = "$(grep '^Signed-off-by: ' "$1" | +- sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { +- echo >&2 Duplicate Signed-off-by lines. 
+- exit 1 +-} +diff -uNr ascend-llm/.git/hooks/fsmonitor-watchman.sample ascend-llm-qwen/.git/hooks/fsmonitor-watchman.sample +--- ascend-llm/.git/hooks/fsmonitor-watchman.sample 2024-09-04 19:20:58.893526000 +0800 ++++ ascend-llm-qwen/.git/hooks/fsmonitor-watchman.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,174 +0,0 @@ +-#!/usr/bin/perl +- +-use strict; +-use warnings; +-use IPC::Open2; +- +-# An example hook script to integrate Watchman +-# (https://facebook.github.io/watchman/) with git to speed up detecting +-# new and modified files. +-# +-# The hook is passed a version (currently 2) and last update token +-# formatted as a string and outputs to stdout a new update token and +-# all files that have been modified since the update token. Paths must +-# be relative to the root of the working tree and separated by a single NUL. +-# +-# To enable this hook, rename this file to "query-watchman" and set +-# 'git config core.fsmonitor .git/hooks/query-watchman' +-# +-my ($version, $last_update_token) = @ARGV; +- +-# Uncomment for debugging +-# print STDERR "$0 $version $last_update_token\n"; +- +-# Check the hook interface version +-if ($version ne 2) { +- die "Unsupported query-fsmonitor hook version '$version'.\n" . +- "Falling back to scanning...\n"; +-} +- +-my $git_work_tree = get_working_dir(); +- +-my $retry = 1; +- +-my $json_pkg; +-eval { +- require JSON::XS; +- $json_pkg = "JSON::XS"; +- 1; +-} or do { +- require JSON::PP; +- $json_pkg = "JSON::PP"; +-}; +- +-launch_watchman(); +- +-sub launch_watchman { +- my $o = watchman_query(); +- if (is_work_tree_watched($o)) { +- output_result($o->{clock}, @{$o->{files}}); +- } +-} +- +-sub output_result { +- my ($clockid, @files) = @_; +- +- # Uncomment for debugging watchman output +- # open (my $fh, ">", ".git/watchman-output.out"); +- # binmode $fh, ":utf8"; +- # print $fh "$clockid\n@files\n"; +- # close $fh; +- +- binmode STDOUT, ":utf8"; +- print $clockid; +- print "\0"; +- local $, = "\0"; +- print @files; +-} +- +-sub watchman_clock { +- my $response = qx/watchman clock "$git_work_tree"/; +- die "Failed to get clock id on '$git_work_tree'.\n" . +- "Falling back to scanning...\n" if $? != 0; +- +- return $json_pkg->new->utf8->decode($response); +-} +- +-sub watchman_query { +- my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty') +- or die "open2() failed: $!\n" . +- "Falling back to scanning...\n"; +- +- # In the query expression below we're asking for names of files that +- # changed since $last_update_token but not from the .git folder. +- # +- # To accomplish this, we're using the "since" generator to use the +- # recency index to select candidate nodes and "fields" to limit the +- # output to file names only. Then we're using the "expression" term to +- # further constrain the results. 
+- my $last_update_line = ""; +- if (substr($last_update_token, 0, 1) eq "c") { +- $last_update_token = "\"$last_update_token\""; +- $last_update_line = qq[\n"since": $last_update_token,]; +- } +- my $query = <<" END"; +- ["query", "$git_work_tree", {$last_update_line +- "fields": ["name"], +- "expression": ["not", ["dirname", ".git"]] +- }] +- END +- +- # Uncomment for debugging the watchman query +- # open (my $fh, ">", ".git/watchman-query.json"); +- # print $fh $query; +- # close $fh; +- +- print CHLD_IN $query; +- close CHLD_IN; +- my $response = do {local $/; }; +- +- # Uncomment for debugging the watch response +- # open ($fh, ">", ".git/watchman-response.json"); +- # print $fh $response; +- # close $fh; +- +- die "Watchman: command returned no output.\n" . +- "Falling back to scanning...\n" if $response eq ""; +- die "Watchman: command returned invalid output: $response\n" . +- "Falling back to scanning...\n" unless $response =~ /^\{/; +- +- return $json_pkg->new->utf8->decode($response); +-} +- +-sub is_work_tree_watched { +- my ($output) = @_; +- my $error = $output->{error}; +- if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) { +- $retry--; +- my $response = qx/watchman watch "$git_work_tree"/; +- die "Failed to make watchman watch '$git_work_tree'.\n" . +- "Falling back to scanning...\n" if $? != 0; +- $output = $json_pkg->new->utf8->decode($response); +- $error = $output->{error}; +- die "Watchman: $error.\n" . +- "Falling back to scanning...\n" if $error; +- +- # Uncomment for debugging watchman output +- # open (my $fh, ">", ".git/watchman-output.out"); +- # close $fh; +- +- # Watchman will always return all files on the first query so +- # return the fast "everything is dirty" flag to git and do the +- # Watchman query just to get it over with now so we won't pay +- # the cost in git to look up each individual file. +- my $o = watchman_clock(); +- $error = $output->{error}; +- +- die "Watchman: $error.\n" . +- "Falling back to scanning...\n" if $error; +- +- output_result($o->{clock}, ("/")); +- $last_update_token = $o->{clock}; +- +- eval { launch_watchman() }; +- return 0; +- } +- +- die "Watchman: $error.\n" . +- "Falling back to scanning...\n" if $error; +- +- return 1; +-} +- +-sub get_working_dir { +- my $working_dir; +- if ($^O =~ 'msys' || $^O =~ 'cygwin') { +- $working_dir = Win32::GetCwd(); +- $working_dir =~ tr/\\/\//; +- } else { +- require Cwd; +- $working_dir = Cwd::cwd(); +- } +- +- return $working_dir; +-} +diff -uNr ascend-llm/.git/hooks/post-update.sample ascend-llm-qwen/.git/hooks/post-update.sample +--- ascend-llm/.git/hooks/post-update.sample 2024-09-04 19:20:58.893526000 +0800 ++++ ascend-llm-qwen/.git/hooks/post-update.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,8 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to prepare a packed repository for use over +-# dumb transports. +-# +-# To enable this hook, rename this file to "post-update". +- +-exec git update-server-info +diff -uNr ascend-llm/.git/hooks/pre-applypatch.sample ascend-llm-qwen/.git/hooks/pre-applypatch.sample +--- ascend-llm/.git/hooks/pre-applypatch.sample 2024-09-04 19:20:58.893526000 +0800 ++++ ascend-llm-qwen/.git/hooks/pre-applypatch.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,14 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to verify what is about to be committed +-# by applypatch from an e-mail message. 
+-# +-# The hook should exit with non-zero status after issuing an +-# appropriate message if it wants to stop the commit. +-# +-# To enable this hook, rename this file to "pre-applypatch". +- +-. git-sh-setup +-precommit="$(git rev-parse --git-path hooks/pre-commit)" +-test -x "$precommit" && exec "$precommit" ${1+"$@"} +-: +diff -uNr ascend-llm/.git/hooks/pre-commit.sample ascend-llm-qwen/.git/hooks/pre-commit.sample +--- ascend-llm/.git/hooks/pre-commit.sample 2024-09-04 19:20:58.894662300 +0800 ++++ ascend-llm-qwen/.git/hooks/pre-commit.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,49 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to verify what is about to be committed. +-# Called by "git commit" with no arguments. The hook should +-# exit with non-zero status after issuing an appropriate message if +-# it wants to stop the commit. +-# +-# To enable this hook, rename this file to "pre-commit". +- +-if git rev-parse --verify HEAD >/dev/null 2>&1 +-then +- against=HEAD +-else +- # Initial commit: diff against an empty tree object +- against=$(git hash-object -t tree /dev/null) +-fi +- +-# If you want to allow non-ASCII filenames set this variable to true. +-allownonascii=$(git config --type=bool hooks.allownonascii) +- +-# Redirect output to stderr. +-exec 1>&2 +- +-# Cross platform projects tend to avoid non-ASCII filenames; prevent +-# them from being added to the repository. We exploit the fact that the +-# printable range starts at the space character and ends with tilde. +-if [ "$allownonascii" != "true" ] && +- # Note that the use of brackets around a tr range is ok here, (it's +- # even required, for portability to Solaris 10's /usr/bin/tr), since +- # the square bracket bytes happen to fall in the designated range. +- test $(git diff-index --cached --name-only --diff-filter=A -z $against | +- LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 +-then +- cat <<\EOF +-Error: Attempt to add a non-ASCII file name. +- +-This can cause problems if you want to work with people on other platforms. +- +-To be portable it is advisable to rename the file. +- +-If you know what you are doing you can disable this check using: +- +- git config hooks.allownonascii true +-EOF +- exit 1 +-fi +- +-# If there are whitespace errors, print the offending file names and fail. +-exec git diff-index --check --cached $against -- +diff -uNr ascend-llm/.git/hooks/pre-merge-commit.sample ascend-llm-qwen/.git/hooks/pre-merge-commit.sample +--- ascend-llm/.git/hooks/pre-merge-commit.sample 2024-09-04 19:20:58.894662300 +0800 ++++ ascend-llm-qwen/.git/hooks/pre-merge-commit.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,13 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to verify what is about to be committed. +-# Called by "git merge" with no arguments. The hook should +-# exit with non-zero status after issuing an appropriate message to +-# stderr if it wants to stop the merge commit. +-# +-# To enable this hook, rename this file to "pre-merge-commit". +- +-. git-sh-setup +-test -x "$GIT_DIR/hooks/pre-commit" && +- exec "$GIT_DIR/hooks/pre-commit" +-: +diff -uNr ascend-llm/.git/hooks/pre-push.sample ascend-llm-qwen/.git/hooks/pre-push.sample +--- ascend-llm/.git/hooks/pre-push.sample 2024-09-04 19:20:58.894662300 +0800 ++++ ascend-llm-qwen/.git/hooks/pre-push.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,53 +0,0 @@ +-#!/bin/sh +- +-# An example hook script to verify what is about to be pushed. Called by "git +-# push" after it has checked the remote status, but before anything has been +-# pushed. 
If this script exits with a non-zero status nothing will be pushed. +-# +-# This hook is called with the following parameters: +-# +-# $1 -- Name of the remote to which the push is being done +-# $2 -- URL to which the push is being done +-# +-# If pushing without using a named remote those arguments will be equal. +-# +-# Information about the commits which are being pushed is supplied as lines to +-# the standard input in the form: +-# +-# +-# +-# This sample shows how to prevent push of commits where the log message starts +-# with "WIP" (work in progress). +- +-remote="$1" +-url="$2" +- +-zero=$(git hash-object --stdin &2 "Found WIP commit in $local_ref, not pushing" +- exit 1 +- fi +- fi +-done +- +-exit 0 +diff -uNr ascend-llm/.git/hooks/pre-rebase.sample ascend-llm-qwen/.git/hooks/pre-rebase.sample +--- ascend-llm/.git/hooks/pre-rebase.sample 2024-09-04 19:20:58.894662300 +0800 ++++ ascend-llm-qwen/.git/hooks/pre-rebase.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,169 +0,0 @@ +-#!/bin/sh +-# +-# Copyright (c) 2006, 2008 Junio C Hamano +-# +-# The "pre-rebase" hook is run just before "git rebase" starts doing +-# its job, and can prevent the command from running by exiting with +-# non-zero status. +-# +-# The hook is called with the following parameters: +-# +-# $1 -- the upstream the series was forked from. +-# $2 -- the branch being rebased (or empty when rebasing the current branch). +-# +-# This sample shows how to prevent topic branches that are already +-# merged to 'next' branch from getting rebased, because allowing it +-# would result in rebasing already published history. +- +-publish=next +-basebranch="$1" +-if test "$#" = 2 +-then +- topic="refs/heads/$2" +-else +- topic=`git symbolic-ref HEAD` || +- exit 0 ;# we do not interrupt rebasing detached HEAD +-fi +- +-case "$topic" in +-refs/heads/??/*) +- ;; +-*) +- exit 0 ;# we do not interrupt others. +- ;; +-esac +- +-# Now we are dealing with a topic branch being rebased +-# on top of master. Is it OK to rebase it? +- +-# Does the topic really exist? +-git show-ref -q "$topic" || { +- echo >&2 "No such branch $topic" +- exit 1 +-} +- +-# Is topic fully merged to master? +-not_in_master=`git rev-list --pretty=oneline ^master "$topic"` +-if test -z "$not_in_master" +-then +- echo >&2 "$topic is fully merged to master; better remove it." +- exit 1 ;# we could allow it, but there is no point. +-fi +- +-# Is topic ever merged to next? If so you should not be rebasing it. +-only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` +-only_next_2=`git rev-list ^master ${publish} | sort` +-if test "$only_next_1" = "$only_next_2" +-then +- not_in_topic=`git rev-list "^$topic" master` +- if test -z "$not_in_topic" +- then +- echo >&2 "$topic is already up to date with master" +- exit 1 ;# we could allow it, but there is no point. 
+- else +- exit 0 +- fi +-else +- not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` +- /usr/bin/perl -e ' +- my $topic = $ARGV[0]; +- my $msg = "* $topic has commits already merged to public branch:\n"; +- my (%not_in_next) = map { +- /^([0-9a-f]+) /; +- ($1 => 1); +- } split(/\n/, $ARGV[1]); +- for my $elem (map { +- /^([0-9a-f]+) (.*)$/; +- [$1 => $2]; +- } split(/\n/, $ARGV[2])) { +- if (!exists $not_in_next{$elem->[0]}) { +- if ($msg) { +- print STDERR $msg; +- undef $msg; +- } +- print STDERR " $elem->[1]\n"; +- } +- } +- ' "$topic" "$not_in_next" "$not_in_master" +- exit 1 +-fi +- +-<<\DOC_END +- +-This sample hook safeguards topic branches that have been +-published from being rewound. +- +-The workflow assumed here is: +- +- * Once a topic branch forks from "master", "master" is never +- merged into it again (either directly or indirectly). +- +- * Once a topic branch is fully cooked and merged into "master", +- it is deleted. If you need to build on top of it to correct +- earlier mistakes, a new topic branch is created by forking at +- the tip of the "master". This is not strictly necessary, but +- it makes it easier to keep your history simple. +- +- * Whenever you need to test or publish your changes to topic +- branches, merge them into "next" branch. +- +-The script, being an example, hardcodes the publish branch name +-to be "next", but it is trivial to make it configurable via +-$GIT_DIR/config mechanism. +- +-With this workflow, you would want to know: +- +-(1) ... if a topic branch has ever been merged to "next". Young +- topic branches can have stupid mistakes you would rather +- clean up before publishing, and things that have not been +- merged into other branches can be easily rebased without +- affecting other people. But once it is published, you would +- not want to rewind it. +- +-(2) ... if a topic branch has been fully merged to "master". +- Then you can delete it. More importantly, you should not +- build on top of it -- other people may already want to +- change things related to the topic as patches against your +- "master", so if you need further changes, it is better to +- fork the topic (perhaps with the same name) afresh from the +- tip of "master". +- +-Let's look at this example: +- +- o---o---o---o---o---o---o---o---o---o "next" +- / / / / +- / a---a---b A / / +- / / / / +- / / c---c---c---c B / +- / / / \ / +- / / / b---b C \ / +- / / / / \ / +- ---o---o---o---o---o---o---o---o---o---o---o "master" +- +- +-A, B and C are topic branches. +- +- * A has one fix since it was merged up to "next". +- +- * B has finished. It has been fully merged up to "master" and "next", +- and is ready to be deleted. +- +- * C has not merged to "next" at all. +- +-We would want to allow C to be rebased, refuse A, and encourage +-B to be deleted. +- +-To compute (1): +- +- git rev-list ^master ^topic next +- git rev-list ^master next +- +- if these match, topic has not merged in next at all. +- +-To compute (2): +- +- git rev-list master..topic +- +- if this is empty, it is fully merged to "master". +- +-DOC_END +diff -uNr ascend-llm/.git/hooks/pre-receive.sample ascend-llm-qwen/.git/hooks/pre-receive.sample +--- ascend-llm/.git/hooks/pre-receive.sample 2024-09-04 19:20:58.894662300 +0800 ++++ ascend-llm-qwen/.git/hooks/pre-receive.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,24 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to make use of push options. 
+-# The example simply echoes all push options that start with 'echoback=' +-# and rejects all pushes when the "reject" push option is used. +-# +-# To enable this hook, rename this file to "pre-receive". +- +-if test -n "$GIT_PUSH_OPTION_COUNT" +-then +- i=0 +- while test "$i" -lt "$GIT_PUSH_OPTION_COUNT" +- do +- eval "value=\$GIT_PUSH_OPTION_$i" +- case "$value" in +- echoback=*) +- echo "echo from the pre-receive-hook: ${value#*=}" >&2 +- ;; +- reject) +- exit 1 +- esac +- i=$((i + 1)) +- done +-fi +diff -uNr ascend-llm/.git/hooks/prepare-commit-msg.sample ascend-llm-qwen/.git/hooks/prepare-commit-msg.sample +--- ascend-llm/.git/hooks/prepare-commit-msg.sample 2024-09-04 19:20:58.894662300 +0800 ++++ ascend-llm-qwen/.git/hooks/prepare-commit-msg.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,42 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to prepare the commit log message. +-# Called by "git commit" with the name of the file that has the +-# commit message, followed by the description of the commit +-# message's source. The hook's purpose is to edit the commit +-# message file. If the hook fails with a non-zero status, +-# the commit is aborted. +-# +-# To enable this hook, rename this file to "prepare-commit-msg". +- +-# This hook includes three examples. The first one removes the +-# "# Please enter the commit message..." help message. +-# +-# The second includes the output of "git diff --name-status -r" +-# into the message, just before the "git status" output. It is +-# commented because it doesn't cope with --amend or with squashed +-# commits. +-# +-# The third example adds a Signed-off-by line to the message, that can +-# still be edited. This is rarely a good idea. +- +-COMMIT_MSG_FILE=$1 +-COMMIT_SOURCE=$2 +-SHA1=$3 +- +-/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE" +- +-# case "$COMMIT_SOURCE,$SHA1" in +-# ,|template,) +-# /usr/bin/perl -i.bak -pe ' +-# print "\n" . `git diff --cached --name-status -r` +-# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;; +-# *) ;; +-# esac +- +-# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +-# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE" +-# if test -z "$COMMIT_SOURCE" +-# then +-# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE" +-# fi +diff -uNr ascend-llm/.git/hooks/push-to-checkout.sample ascend-llm-qwen/.git/hooks/push-to-checkout.sample +--- ascend-llm/.git/hooks/push-to-checkout.sample 2024-09-04 19:20:58.895659900 +0800 ++++ ascend-llm-qwen/.git/hooks/push-to-checkout.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,78 +0,0 @@ +-#!/bin/sh +- +-# An example hook script to update a checked-out tree on a git push. +-# +-# This hook is invoked by git-receive-pack(1) when it reacts to git +-# push and updates reference(s) in its repository, and when the push +-# tries to update the branch that is currently checked out and the +-# receive.denyCurrentBranch configuration variable is set to +-# updateInstead. +-# +-# By default, such a push is refused if the working tree and the index +-# of the remote repository has any difference from the currently +-# checked out commit; when both the working tree and the index match +-# the current commit, they are updated to match the newly pushed tip +-# of the branch. This hook is to be used to override the default +-# behaviour; however the code below reimplements the default behaviour +-# as a starting point for convenient modification. 
+-# +-# The hook receives the commit with which the tip of the current +-# branch is going to be updated: +-commit=$1 +- +-# It can exit with a non-zero status to refuse the push (when it does +-# so, it must not modify the index or the working tree). +-die () { +- echo >&2 "$*" +- exit 1 +-} +- +-# Or it can make any necessary changes to the working tree and to the +-# index to bring them to the desired state when the tip of the current +-# branch is updated to the new commit, and exit with a zero status. +-# +-# For example, the hook can simply run git read-tree -u -m HEAD "$1" +-# in order to emulate git fetch that is run in the reverse direction +-# with git push, as the two-tree form of git read-tree -u -m is +-# essentially the same as git switch or git checkout that switches +-# branches while keeping the local changes in the working tree that do +-# not interfere with the difference between the branches. +- +-# The below is a more-or-less exact translation to shell of the C code +-# for the default behaviour for git's push-to-checkout hook defined in +-# the push_to_deploy() function in builtin/receive-pack.c. +-# +-# Note that the hook will be executed from the repository directory, +-# not from the working tree, so if you want to perform operations on +-# the working tree, you will have to adapt your code accordingly, e.g. +-# by adding "cd .." or using relative paths. +- +-if ! git update-index -q --ignore-submodules --refresh +-then +- die "Up-to-date check failed" +-fi +- +-if ! git diff-files --quiet --ignore-submodules -- +-then +- die "Working directory has unstaged changes" +-fi +- +-# This is a rough translation of: +-# +-# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX +-if git cat-file -e HEAD 2>/dev/null +-then +- head=HEAD +-else +- head=$(git hash-object -t tree --stdin &2 +- exit 1 +-} +- +-unset GIT_DIR GIT_WORK_TREE +-cd "$worktree" && +- +-if grep -q "^diff --git " "$1" +-then +- validate_patch "$1" +-else +- validate_cover_letter "$1" +-fi && +- +-if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL" +-then +- git config --unset-all sendemail.validateWorktree && +- trap 'git worktree remove -ff "$worktree"' EXIT && +- validate_series +-fi +diff -uNr ascend-llm/.git/hooks/update.sample ascend-llm-qwen/.git/hooks/update.sample +--- ascend-llm/.git/hooks/update.sample 2024-09-04 19:20:58.895659900 +0800 ++++ ascend-llm-qwen/.git/hooks/update.sample 1970-01-01 08:00:00.000000000 +0800 +@@ -1,128 +0,0 @@ +-#!/bin/sh +-# +-# An example hook script to block unannotated tags from entering. +-# Called by "git receive-pack" with arguments: refname sha1-old sha1-new +-# +-# To enable this hook, rename this file to "update". +-# +-# Config +-# ------ +-# hooks.allowunannotated +-# This boolean sets whether unannotated tags will be allowed into the +-# repository. By default they won't be. +-# hooks.allowdeletetag +-# This boolean sets whether deleting tags will be allowed in the +-# repository. By default they won't be. +-# hooks.allowmodifytag +-# This boolean sets whether a tag may be modified after creation. By default +-# it won't be. +-# hooks.allowdeletebranch +-# This boolean sets whether deleting branches will be allowed in the +-# repository. By default they won't be. +-# hooks.denycreatebranch +-# This boolean sets whether remotely creating branches will be denied +-# in the repository. By default this is allowed. 
+-# +- +-# --- Command line +-refname="$1" +-oldrev="$2" +-newrev="$3" +- +-# --- Safety check +-if [ -z "$GIT_DIR" ]; then +- echo "Don't run this script from the command line." >&2 +- echo " (if you want, you could supply GIT_DIR then run" >&2 +- echo " $0 )" >&2 +- exit 1 +-fi +- +-if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then +- echo "usage: $0 " >&2 +- exit 1 +-fi +- +-# --- Config +-allowunannotated=$(git config --type=bool hooks.allowunannotated) +-allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch) +-denycreatebranch=$(git config --type=bool hooks.denycreatebranch) +-allowdeletetag=$(git config --type=bool hooks.allowdeletetag) +-allowmodifytag=$(git config --type=bool hooks.allowmodifytag) +- +-# check for no description +-projectdesc=$(sed -e '1q' "$GIT_DIR/description") +-case "$projectdesc" in +-"Unnamed repository"* | "") +- echo "*** Project description file hasn't been set" >&2 +- exit 1 +- ;; +-esac +- +-# --- Check types +-# if $newrev is 0000...0000, it's a commit to delete a ref. +-zero=$(git hash-object --stdin &2 +- echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 +- exit 1 +- fi +- ;; +- refs/tags/*,delete) +- # delete tag +- if [ "$allowdeletetag" != "true" ]; then +- echo "*** Deleting a tag is not allowed in this repository" >&2 +- exit 1 +- fi +- ;; +- refs/tags/*,tag) +- # annotated tag +- if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 +- then +- echo "*** Tag '$refname' already exists." >&2 +- echo "*** Modifying a tag is not allowed in this repository." >&2 +- exit 1 +- fi +- ;; +- refs/heads/*,commit) +- # branch +- if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then +- echo "*** Creating a branch is not allowed in this repository" >&2 +- exit 1 +- fi +- ;; +- refs/heads/*,delete) +- # delete branch +- if [ "$allowdeletebranch" != "true" ]; then +- echo "*** Deleting a branch is not allowed in this repository" >&2 +- exit 1 +- fi +- ;; +- refs/remotes/*,commit) +- # tracking branch +- ;; +- refs/remotes/*,delete) +- # delete tracking branch +- if [ "$allowdeletebranch" != "true" ]; then +- echo "*** Deleting a tracking branch is not allowed in this repository" >&2 +- exit 1 +- fi +- ;; +- *) +- # Anything else (is there anything else?) +- echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 +- exit 1 +- ;; +-esac +- +-# --- Finished +-exit 0 +Binary files ascend-llm/.git/index and ascend-llm-qwen/.git/index differ +diff -uNr ascend-llm/.git/info/exclude ascend-llm-qwen/.git/info/exclude +--- ascend-llm/.git/info/exclude 2024-09-04 19:20:58.895659900 +0800 ++++ ascend-llm-qwen/.git/info/exclude 1970-01-01 08:00:00.000000000 +0800 +@@ -1,6 +0,0 @@ +-# git ls-files --others --exclude-from=.git/info/exclude +-# Lines that start with '#' are comments. 
+-# For a project mostly in C, the following would be a good set of +-# exclude patterns (uncomment them if you want to use them): +-# *.[oa] +-# *~ +diff -uNr ascend-llm/.git/logs/HEAD ascend-llm-qwen/.git/logs/HEAD +--- ascend-llm/.git/logs/HEAD 2024-09-04 19:28:06.528235700 +0800 ++++ ascend-llm-qwen/.git/logs/HEAD 1970-01-01 08:00:00.000000000 +0800 +@@ -1,2 +0,0 @@ +-0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725448863 +0800 clone: from https://gitee.com/yinghuo302/ascend-llm.git +-1392d7fccbf5fbf1bf4df781cca919abd046a80d 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725449286 +0800 checkout: moving from main to lscno2 +diff -uNr ascend-llm/.git/logs/refs/heads/lscno2 ascend-llm-qwen/.git/logs/refs/heads/lscno2 +--- ascend-llm/.git/logs/refs/heads/lscno2 2024-09-04 19:27:54.495854600 +0800 ++++ ascend-llm-qwen/.git/logs/refs/heads/lscno2 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725449274 +0800 branch: Created from main +diff -uNr ascend-llm/.git/logs/refs/heads/main ascend-llm-qwen/.git/logs/refs/heads/main +--- ascend-llm/.git/logs/refs/heads/main 2024-09-04 19:21:03.028244000 +0800 ++++ ascend-llm-qwen/.git/logs/refs/heads/main 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725448863 +0800 clone: from https://gitee.com/yinghuo302/ascend-llm.git +diff -uNr ascend-llm/.git/logs/refs/remotes/origin/HEAD ascend-llm-qwen/.git/logs/refs/remotes/origin/HEAD +--- ascend-llm/.git/logs/refs/remotes/origin/HEAD 2024-09-04 19:21:03.022263900 +0800 ++++ ascend-llm-qwen/.git/logs/refs/remotes/origin/HEAD 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725448863 +0800 clone: from https://gitee.com/yinghuo302/ascend-llm.git +Binary files ascend-llm/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.idx and ascend-llm-qwen/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.idx differ +Binary files ascend-llm/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.pack and ascend-llm-qwen/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.pack differ +Binary files ascend-llm/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.rev and ascend-llm-qwen/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.rev differ +diff -uNr ascend-llm/.git/packed-refs ascend-llm-qwen/.git/packed-refs +--- ascend-llm/.git/packed-refs 2024-09-04 19:21:03.020279000 +0800 ++++ ascend-llm-qwen/.git/packed-refs 1970-01-01 08:00:00.000000000 +0800 +@@ -1,2 +0,0 @@ +-# pack-refs with: peeled fully-peeled sorted +-1392d7fccbf5fbf1bf4df781cca919abd046a80d refs/remotes/origin/main +diff -uNr ascend-llm/.git/refs/heads/lscno2 ascend-llm-qwen/.git/refs/heads/lscno2 +--- ascend-llm/.git/refs/heads/lscno2 2024-09-04 19:27:54.495854600 +0800 ++++ ascend-llm-qwen/.git/refs/heads/lscno2 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-1392d7fccbf5fbf1bf4df781cca919abd046a80d +diff -uNr ascend-llm/.git/refs/heads/main ascend-llm-qwen/.git/refs/heads/main +--- ascend-llm/.git/refs/heads/main 2024-09-04 19:21:03.028244000 +0800 ++++ ascend-llm-qwen/.git/refs/heads/main 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-1392d7fccbf5fbf1bf4df781cca919abd046a80d +diff -uNr ascend-llm/.git/refs/remotes/origin/HEAD 
ascend-llm-qwen/.git/refs/remotes/origin/HEAD +--- ascend-llm/.git/refs/remotes/origin/HEAD 2024-09-04 19:21:03.022263900 +0800 ++++ ascend-llm-qwen/.git/refs/remotes/origin/HEAD 1970-01-01 08:00:00.000000000 +0800 +@@ -1 +0,0 @@ +-ref: refs/remotes/origin/main +diff -uNr ascend-llm/.gitignore ascend-llm-qwen/.gitignore +--- ascend-llm/.gitignore 2024-09-04 19:21:03.040203700 +0800 ++++ ascend-llm-qwen/.gitignore 1970-01-01 08:00:00.000000000 +0800 +@@ -1,2 +0,0 @@ +-*.pyc +-inference/dist/* +\ No newline at end of file +diff -uNr ascend-llm/LICENSE ascend-llm-qwen/LICENSE +--- ascend-llm/LICENSE 2024-09-04 19:21:03.040203700 +0800 ++++ ascend-llm-qwen/LICENSE 1970-01-01 08:00:00.000000000 +0800 +@@ -1,201 +0,0 @@ +- Apache License +- Version 2.0, January 2004 +- http://www.apache.org/licenses/ +- +- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +- +- 1. Definitions. +- +- "License" shall mean the terms and conditions for use, reproduction, +- and distribution as defined by Sections 1 through 9 of this document. +- +- "Licensor" shall mean the copyright owner or entity authorized by +- the copyright owner that is granting the License. +- +- "Legal Entity" shall mean the union of the acting entity and all +- other entities that control, are controlled by, or are under common +- control with that entity. For the purposes of this definition, +- "control" means (i) the power, direct or indirect, to cause the +- direction or management of such entity, whether by contract or +- otherwise, or (ii) ownership of fifty percent (50%) or more of the +- outstanding shares, or (iii) beneficial ownership of such entity. +- +- "You" (or "Your") shall mean an individual or Legal Entity +- exercising permissions granted by this License. +- +- "Source" form shall mean the preferred form for making modifications, +- including but not limited to software source code, documentation +- source, and configuration files. +- +- "Object" form shall mean any form resulting from mechanical +- transformation or translation of a Source form, including but +- not limited to compiled object code, generated documentation, +- and conversions to other media types. +- +- "Work" shall mean the work of authorship, whether in Source or +- Object form, made available under the License, as indicated by a +- copyright notice that is included in or attached to the work +- (an example is provided in the Appendix below). +- +- "Derivative Works" shall mean any work, whether in Source or Object +- form, that is based on (or derived from) the Work and for which the +- editorial revisions, annotations, elaborations, or other modifications +- represent, as a whole, an original work of authorship. For the purposes +- of this License, Derivative Works shall not include works that remain +- separable from, or merely link (or bind by name) to the interfaces of, +- the Work and Derivative Works thereof. +- +- "Contribution" shall mean any work of authorship, including +- the original version of the Work and any modifications or additions +- to that Work or Derivative Works thereof, that is intentionally +- submitted to Licensor for inclusion in the Work by the copyright owner +- or by an individual or Legal Entity authorized to submit on behalf of +- the copyright owner. 
For the purposes of this definition, "submitted" +- means any form of electronic, verbal, or written communication sent +- to the Licensor or its representatives, including but not limited to +- communication on electronic mailing lists, source code control systems, +- and issue tracking systems that are managed by, or on behalf of, the +- Licensor for the purpose of discussing and improving the Work, but +- excluding communication that is conspicuously marked or otherwise +- designated in writing by the copyright owner as "Not a Contribution." +- +- "Contributor" shall mean Licensor and any individual or Legal Entity +- on behalf of whom a Contribution has been received by Licensor and +- subsequently incorporated within the Work. +- +- 2. Grant of Copyright License. Subject to the terms and conditions of +- this License, each Contributor hereby grants to You a perpetual, +- worldwide, non-exclusive, no-charge, royalty-free, irrevocable +- copyright license to reproduce, prepare Derivative Works of, +- publicly display, publicly perform, sublicense, and distribute the +- Work and such Derivative Works in Source or Object form. +- +- 3. Grant of Patent License. Subject to the terms and conditions of +- this License, each Contributor hereby grants to You a perpetual, +- worldwide, non-exclusive, no-charge, royalty-free, irrevocable +- (except as stated in this section) patent license to make, have made, +- use, offer to sell, sell, import, and otherwise transfer the Work, +- where such license applies only to those patent claims licensable +- by such Contributor that are necessarily infringed by their +- Contribution(s) alone or by combination of their Contribution(s) +- with the Work to which such Contribution(s) was submitted. If You +- institute patent litigation against any entity (including a +- cross-claim or counterclaim in a lawsuit) alleging that the Work +- or a Contribution incorporated within the Work constitutes direct +- or contributory patent infringement, then any patent licenses +- granted to You under this License for that Work shall terminate +- as of the date such litigation is filed. +- +- 4. Redistribution. You may reproduce and distribute copies of the +- Work or Derivative Works thereof in any medium, with or without +- modifications, and in Source or Object form, provided that You +- meet the following conditions: +- +- (a) You must give any other recipients of the Work or +- Derivative Works a copy of this License; and +- +- (b) You must cause any modified files to carry prominent notices +- stating that You changed the files; and +- +- (c) You must retain, in the Source form of any Derivative Works +- that You distribute, all copyright, patent, trademark, and +- attribution notices from the Source form of the Work, +- excluding those notices that do not pertain to any part of +- the Derivative Works; and +- +- (d) If the Work includes a "NOTICE" text file as part of its +- distribution, then any Derivative Works that You distribute must +- include a readable copy of the attribution notices contained +- within such NOTICE file, excluding those notices that do not +- pertain to any part of the Derivative Works, in at least one +- of the following places: within a NOTICE text file distributed +- as part of the Derivative Works; within the Source form or +- documentation, if provided along with the Derivative Works; or, +- within a display generated by the Derivative Works, if and +- wherever such third-party notices normally appear. 
The contents +- of the NOTICE file are for informational purposes only and +- do not modify the License. You may add Your own attribution +- notices within Derivative Works that You distribute, alongside +- or as an addendum to the NOTICE text from the Work, provided +- that such additional attribution notices cannot be construed +- as modifying the License. +- +- You may add Your own copyright statement to Your modifications and +- may provide additional or different license terms and conditions +- for use, reproduction, or distribution of Your modifications, or +- for any such Derivative Works as a whole, provided Your use, +- reproduction, and distribution of the Work otherwise complies with +- the conditions stated in this License. +- +- 5. Submission of Contributions. Unless You explicitly state otherwise, +- any Contribution intentionally submitted for inclusion in the Work +- by You to the Licensor shall be under the terms and conditions of +- this License, without any additional terms or conditions. +- Notwithstanding the above, nothing herein shall supersede or modify +- the terms of any separate license agreement you may have executed +- with Licensor regarding such Contributions. +- +- 6. Trademarks. This License does not grant permission to use the trade +- names, trademarks, service marks, or product names of the Licensor, +- except as required for reasonable and customary use in describing the +- origin of the Work and reproducing the content of the NOTICE file. +- +- 7. Disclaimer of Warranty. Unless required by applicable law or +- agreed to in writing, Licensor provides the Work (and each +- Contributor provides its Contributions) on an "AS IS" BASIS, +- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +- implied, including, without limitation, any warranties or conditions +- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +- PARTICULAR PURPOSE. You are solely responsible for determining the +- appropriateness of using or redistributing the Work and assume any +- risks associated with Your exercise of permissions under this License. +- +- 8. Limitation of Liability. In no event and under no legal theory, +- whether in tort (including negligence), contract, or otherwise, +- unless required by applicable law (such as deliberate and grossly +- negligent acts) or agreed to in writing, shall any Contributor be +- liable to You for damages, including any direct, indirect, special, +- incidental, or consequential damages of any character arising as a +- result of this License or out of the use or inability to use the +- Work (including but not limited to damages for loss of goodwill, +- work stoppage, computer failure or malfunction, or any and all +- other commercial damages or losses), even if such Contributor +- has been advised of the possibility of such damages. +- +- 9. Accepting Warranty or Additional Liability. While redistributing +- the Work or Derivative Works thereof, You may choose to offer, +- and charge a fee for, acceptance of support, warranty, indemnity, +- or other liability obligations and/or rights consistent with this +- License. However, in accepting such obligations, You may act only +- on Your own behalf and on Your sole responsibility, not on behalf +- of any other Contributor, and only if You agree to indemnify, +- defend, and hold each Contributor harmless for any liability +- incurred by, or claims asserted against, such Contributor by reason +- of your accepting any such warranty or additional liability. 
+- +- END OF TERMS AND CONDITIONS +- +- APPENDIX: How to apply the Apache License to your work. +- +- To apply the Apache License to your work, attach the following +- boilerplate notice, with the fields enclosed by brackets "[]" +- replaced with your own identifying information. (Don't include +- the brackets!) The text should be enclosed in the appropriate +- comment syntax for the file format. We also recommend that a +- file or class name and description of purpose be included on the +- same "printed page" as the copyright notice for easier +- identification within third-party archives. +- +- Copyright [yyyy] [name of copyright owner] +- +- Licensed under the Apache License, Version 2.0 (the "License"); +- you may not use this file except in compliance with the License. +- You may obtain a copy of the License at +- +- http://www.apache.org/licenses/LICENSE-2.0 +- +- Unless required by applicable law or agreed to in writing, software +- distributed under the License is distributed on an "AS IS" BASIS, +- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- See the License for the specific language governing permissions and +- limitations under the License. +Binary files ascend-llm/assets/webui.png and ascend-llm-qwen/assets/webui.png differ +diff -uNr ascend-llm/custom_op/matmul_integer_plugin.cc ascend-llm-qwen/custom_op/matmul_integer_plugin.cc +--- ascend-llm/custom_op/matmul_integer_plugin.cc 2024-09-04 19:21:03.043202000 +0800 ++++ ascend-llm-qwen/custom_op/matmul_integer_plugin.cc 1970-01-01 08:00:00.000000000 +0800 +@@ -1,31 +0,0 @@ +-/* Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the Apache License Version 2.0. +- * You may not use this file except in compliance with the License. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +- * Apache License for more details at +- * http://www.apache.org/licenses/LICENSE-2.0 +- */ +- +-#include "register/register.h" +- +-namespace domi { +-Status ParseParamsMatmulInteger(const ge::Operator& op_src, ge::Operator& op_dest) { +- return SUCCESS; +-} +- +-REGISTER_CUSTOM_OP("BatchMatMulV2") +- .FrameworkType(ONNX) +- .OriginOpType({ge::AscendString("ai.onnx::14::MatMulInteger"), +- ge::AscendString("ai.onnx::15::MatMulInteger"), +- ge::AscendString("ai.onnx::10::MatMulInteger"), +- ge::AscendString("ai.onnx::11::MatMulInteger"), +- ge::AscendString("ai.onnx::12::MatMulInteger"), +- ge::AscendString("ai.onnx::13::MatMulInteger")}) +- .ParseParamsByOperatorFn(ParseParamsMatmulInteger) +- .ImplyType(ImplyType::TVM); +-} // namespace domi +Binary files ascend-llm/export_llama/act_scales/llama-2-7b.pt and ascend-llm-qwen/export_llama/act_scales/llama-2-7b.pt differ +Binary files ascend-llm/export_llama/act_scales/tiny-llama.pt and ascend-llm-qwen/export_llama/act_scales/tiny-llama.pt differ +diff -uNr ascend-llm/export_llama/change_node.py ascend-llm-qwen/export_llama/change_node.py +--- ascend-llm/export_llama/change_node.py 2024-09-04 19:21:03.074127700 +0800 ++++ ascend-llm-qwen/export_llama/change_node.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,55 +0,0 @@ +-import argparse +-import onnx +-import onnx.helper as helper +-from onnx import TensorProto +-def change_node(in_path,out_path): +- model = onnx.load(in_path) +- new_nodes = [] +- +- for node in model.graph.node: +- # 判断节点类型 +- new_node = node +- if node.op_type == "Cast": +- # 替换为新的算子类型, 昇腾Cast fp16 -> int8 有精度问题,暂时用AscendQuant +- to_attribute = next(attr for attr in node.attribute if attr.name == "to") +- if to_attribute.i == TensorProto.INT8: +- new_node = helper.make_node( +- "AscendQuant", +- inputs=node.input, +- outputs=node.output, +- offset=0., +- scale=1., +- ) +- new_nodes.append(new_node) +- +- new_graph = helper.make_graph( +- new_nodes, +- "new_graph", +- inputs=model.graph.input, +- outputs=model.graph.output, +- value_info=model.graph.value_info, +- initializer=model.graph.initializer +- ) +- +- new_model = helper.make_model(new_graph, producer_name=model.producer_name,opset_imports=model.opset_import,ir_version = model.ir_version) +- # new_model.ir_version = model.ir_version +- # new_model.opset_import = model.opset_import +- # new_model.metadata_props = model.metadata_props +- onnx.save(new_model, out_path,save_as_external_data=True,size_threshold=0,convert_attribute=True) +- +-if __name__ == "__main__": +- parser = argparse.ArgumentParser() +- parser.add_argument( +- "--input", +- type=str, +- default="./model/export_out/tiny-llama.onnx", +- help="path to onnx model that need to be processed" +- ) +- parser.add_argument( +- "--output", +- type=str, +- default="./model/change_node_out/tiny-llama.onnx", +- help="where to save new onnx model", +- ) +- args = parser.parse_args() +- change_node(args.input,args.output) +\ No newline at end of file +diff -uNr ascend-llm/export_llama/config/no.py ascend-llm-qwen/export_llama/config/no.py +--- ascend-llm/export_llama/config/no.py 2024-09-04 19:21:03.074127700 +0800 ++++ ascend-llm-qwen/export_llama/config/no.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,3 +0,0 @@ +-# 不进行量化 +-def get(model_cfg,act_max): +- return {} +\ No newline at end of file +diff -uNr ascend-llm/export_llama/config/sd.py ascend-llm-qwen/export_llama/config/sd.py +--- ascend-llm/export_llama/config/sd.py 2024-09-04 19:21:03.074127700 +0800 ++++ ascend-llm-qwen/export_llama/config/sd.py 
1970-01-01 08:00:00.000000000 +0800 +@@ -1,21 +0,0 @@ +-# 静态混合精度分解 +-def get(model_cfg,act_max): +- quant_cfg = {} +- h_mx,d_mx = findN(0.04 * model_cfg.hidden_size),findN(0.1 * model_cfg.intermediate_size) +- scale,step = 4, 4/model_cfg.num_hidden_layers +- for i in range(model_cfg.num_hidden_layers): +- scale = max(0,scale-step) +- h_cur,d_cur = max(16,h_mx >> int(scale)), max(32,d_mx >> int(scale)) +- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]: +- quant_cfg[str(i)+"."+name] = {"type":"W8SD","act_scale":True,"alpha":h_cur} +- quant_cfg[str(i)+".down_proj"] = {"type":"W8SD","act_scale":True,"alpha":d_cur} +- quant_cfg["lm_head"] = {"type":"W8SD"} +- quant_cfg["act_scales_path"] = act_max +- return quant_cfg +- +-def findN(N): +- sum = 1; +- while True: +- if sum * 2 > N: +- return sum +- sum = sum * 2 +\ No newline at end of file +diff -uNr ascend-llm/export_llama/config/smooth.py ascend-llm-qwen/export_llama/config/smooth.py +--- ascend-llm/export_llama/config/smooth.py 2024-09-04 19:21:03.075092900 +0800 ++++ ascend-llm-qwen/export_llama/config/smooth.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,13 +0,0 @@ +-# 平滑激活 +-def get(model_cfg,act_max): +- quant_cfg = {} +- for i in range(model_cfg.num_hidden_layers): +- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]: +- quant_cfg[str(i)+"."+name] = {"type":"W8X8"} +- # 对某一个具体的层加act_scale的作用: 若为W8X8,则对该层进行smooth;如为W8SD,则用act_scale进行混合精度分解。 +- quant_cfg[str(i)+".down_proj"] = {"type":"W8X8","act_scale":True,"alpha":0.85} +- quant_cfg["lm_head"] = {"type":"W8X8","act_scale":True,"alpha":0.85} +- quant_cfg["act_scales_path"] = act_max +- quant_cfg["alpha"] = 0.85 # smoothquant 迁移系数 +- quant_cfg["smooth"] = True # 整体的smooth控制是将激活值的缩放与RMSNorm融合,不会造成额外的开销,但down_proj层无法使用 +- return quant_cfg +\ No newline at end of file +diff -uNr ascend-llm/export_llama/config/smsd.py ascend-llm-qwen/export_llama/config/smsd.py +--- ascend-llm/export_llama/config/smsd.py 2024-09-04 19:21:03.075092900 +0800 ++++ ascend-llm-qwen/export_llama/config/smsd.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,22 +0,0 @@ +-# 对down_proj混合精度分解,对其他部分平滑激活 +-def get(model_cfg,act_max): +- quant_cfg = {} +- d_mx = findN(0.1 * model_cfg.intermediate_size) +- scale,step = 4, 4/model_cfg.num_hidden_layers +- for i in range(model_cfg.num_hidden_layers): +- scale = max(0,scale-step) +- d_cur = max(32,d_mx >> int(scale)) +- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]: +- quant_cfg[str(i)+"."+name] = {"type":"W8X8"} +- quant_cfg[str(i)+".down_proj"] = {"type":"W8SD","act_scale":True,"alpha":d_cur} +- quant_cfg["lm_head"] = {"type":"W8SD","act_scale":True,"alpha":64} +- quant_cfg["act_scales_path"] = act_max +- quant_cfg["smooth"] = True +- return quant_cfg +- +-def findN(N): +- sum = 1; +- while True: +- if sum * 2 > N: +- return sum +- sum = sum * 2 +\ No newline at end of file +diff -uNr ascend-llm/export_llama/config/w8.py ascend-llm-qwen/export_llama/config/w8.py +--- ascend-llm/export_llama/config/w8.py 2024-09-04 19:21:03.075092900 +0800 ++++ ascend-llm-qwen/export_llama/config/w8.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,8 +0,0 @@ +-# 仅权重int8量化 +-def get(model_cfg,act_max): +- quant_cfg = {} +- for i in range(model_cfg.num_hidden_layers): +- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]: +- quant_cfg[str(i)+"."+name] = {"type":"W8"} +- quant_cfg["lm_head"] = {"type":"W8"} +- return quant_cfg +\ No newline at end of file +diff -uNr 
ascend-llm/export_llama/config/w8dx.py ascend-llm-qwen/export_llama/config/w8dx.py +--- ascend-llm/export_llama/config/w8dx.py 2024-09-04 19:21:03.075092900 +0800 ++++ ascend-llm-qwen/export_llama/config/w8dx.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,10 +0,0 @@ +-# 动态混合精度分解 +-def get(model_cfg,act_max): +- quant_cfg = {} +- for i in range(model_cfg.num_hidden_layers): +- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]: +- quant_cfg[str(i)+"."+name] = {"type":"W8DX"} +- # quant_cfg["lm_head"] = {"type":"W8DX"} # 可以根据需要取消注释 +- # quant_cfg["act_scales_path"] = act_max # 可以根据需要取消注释 +- # quant_cfg["smooth"] = True # 可以根据需要取消注释 +- return quant_cfg +\ No newline at end of file +diff -uNr ascend-llm/export_llama/config/w8x8.py ascend-llm-qwen/export_llama/config/w8x8.py +--- ascend-llm/export_llama/config/w8x8.py 2024-09-04 19:21:03.075092900 +0800 ++++ ascend-llm-qwen/export_llama/config/w8x8.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,8 +0,0 @@ +-# per-token absmax量化 +-def get(model_cfg,act_max): +- quant_cfg = {} +- for i in range(model_cfg.num_hidden_layers): +- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]: +- quant_cfg[str(i)+"."+name] = {"type":"W8X8"} +- quant_cfg["lm_head"] = {"type":"W8X8"} +- return quant_cfg +\ No newline at end of file +diff -uNr ascend-llm/export_llama/eval.py ascend-llm-qwen/export_llama/eval.py +--- ascend-llm/export_llama/eval.py 2024-09-04 19:21:03.076083700 +0800 ++++ ascend-llm-qwen/export_llama/eval.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,161 +0,0 @@ +-import argparse +-import importlib +-import json +-import lm_eval +-from lm_eval.models.huggingface import HFLM +-from lm_eval.utils import make_table +-import torch +-import tqdm +-from datasets import load_dataset +-from transformers import AutoTokenizer,AutoModelForCausalLM +-import datetime +- +-print_ = print +- +-def lm_eval_fn(args): +- global print_ +- lm_obj = HFLM(pretrained=args.model,tokenizer=args.tokenizer, batch_size="auto") +- task_manager = lm_eval.tasks.TaskManager() +- +- results = lm_eval.simple_evaluate( # call simple_evaluate +- model=lm_obj, +- tasks=args.tasks, +- num_fewshot=0, +- task_manager=task_manager, +- ) +- # now = datetime.datetime.now() +- # with open(f'eval-{now.month:02}-{now.day:02}-{now.hour:02}:{now.minute:02}:{now.second:02}.json', 'w') as f: +- # json.dump(results, f) +- print_(make_table(results)) +- +-def ppl_eval_fn(args): +- global print_ +- def evaluate_perplexity(model, tokenizer,dataset): +- def _perplexity(nlls, n_samples, seqlen): +- return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen)) +- data = None +- if dataset == "wikitext": +- data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") +- data = tokenizer("\n\n".join(data["text"]), return_tensors="pt") +- elif dataset == "c4": +- data = load_dataset('allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation') +- data = tokenizer(" ".join(data[:]['text']), return_tensors="pt") +- else: +- raise f"Not support ppl eval dataset:{dataset}" +- data = data.input_ids.to(model.device) +- seqlen = 2048 +- model = model.eval() +- n_samples = data.numel() // seqlen +- +- nlls = [] +- +- with tqdm.tqdm(range(n_samples), desc="Perplexity -") as progress_bar: +- for i in progress_bar: +- start_index = i * seqlen +- end_index = (i + 1) * seqlen +- batch = data[:, start_index:end_index].to(model.device) +- with torch.no_grad(): +- logits = model(batch).logits +- shift_logits = 
logits[:, :-1, :].contiguous().float() +- shift_labels = data[:, start_index:end_index][:, 1:] +- loss_fct = torch.nn.CrossEntropyLoss() +- loss = loss_fct( +- shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) +- ) +- neg_log_likelihood = loss.float() * seqlen +- nlls.append(neg_log_likelihood) +- +- curr_ppl = _perplexity(nlls, i + 1, seqlen) +- progress_bar.set_description(f"Perplexity {curr_ppl:.3f}") +- +- ppl = _perplexity(nlls, n_samples, seqlen) +- print_(f"Perplexity on {dataset}: {ppl.item()}") +- return ppl.item() +- for dataset in args.datasets: +- print_(f"\n-----------------begin test ppl on dataset {dataset}-------------------\n") +- evaluate_perplexity(args.model,args.tokenizer,dataset) +- +-def run_test(args,title:str): +- global print_ +- print_(f"\n-------------------------{title}-----------------------------\n") +- if "ppl" in args.tasks: +- ppl_eval_fn(args) +- args.tasks.remove("ppl") +- if len(args.tasks) != 0: +- lm_eval_fn(args) +- +-def parse_args(): +- now = datetime.datetime.now() +- parser = argparse.ArgumentParser() +- parser.add_argument( +- "--model","-m", +- type=str, default="./model/TinyLlama-1.1B-Chat-v1.0", +- help="path to model or hugging face model id" +- ) +- parser.add_argument( +- "--output","-o", +- type=str, +- default=f"./result-{now.month:02}-{now.day:02}-{now.hour:02}:{now.minute:02}:{now.second:02}.log", +- help="where to save eval result", +- ) +- parser.add_argument( +- "--datasets","-d", +- type=str, +- default="wikitext,c4", +- help=" the dataset used to eval perplexity", +- ) +- parser.add_argument( +- "--tasks","-t", +- type=str, +- default="mmlu,ppl,lambada_openai,boolq,arc_easy,arc_challenge,piqa,winogrande", +- help="tasks parameter for lm-evaluation-harness", +- ) +- parser.add_argument( +- "--act-path","-a", +- type=str, +- default="./act_scales/llama-2-7b.pt", +- help="path to act_scales", +- ) +- parser.add_argument( +- "--quant","-q", +- type=str, +- default="./config/w8x8.py", +- help="path to quant config", +- ) +- return parser.parse_args() +- +- +-def main(): +- import os +- os.chdir(os.path.dirname(__file__)) +- args = parse_args() +- args.datasets = args.datasets.split(",") +- model_name = args.model.split("/")[-1] +- setattr(args,"tokenizer",AutoTokenizer.from_pretrained(args.model)) +- setattr(args,"model",AutoModelForCausalLM.\ +- from_pretrained(args.model,torch_dtype=torch.float16,device_map="auto")) +- args.model.eval() +- out_f = open(args.output,"w") +- def print_fn(*value:object,sep=" ",end="\n",file=None,flush=False): +- out_f.write(sep.join([str(v) for v in value])+end) +- print(*value,sep=sep,end=end,file=file,flush=flush) +- global print_ +- print_ = print_fn +- args.tasks = args.tasks.split(",") +- flag = "ppl" not in args.tasks +- run_test(args,f"test {model_name}") +- args.tasks = args.tasks if flag else (args.tasks + ["ppl"]) +- # quantize +- model_cfg=args.model.model.config +- spec = importlib.util.spec_from_file_location("quant_cfg_module", args.quant) +- quant_cfg_module = importlib.util.module_from_spec(spec) +- spec.loader.exec_module(quant_cfg_module) +- quantize_cfg = quant_cfg_module.get(model_cfg,args.act_path) +- from quantize import quantize +- quantize(args.model,quantize_cfg) +- +- run_test(args,f"test quantized {model_name}") +- out_f.close() +- +-if __name__ == "__main__": +- main() +\ No newline at end of file +diff -uNr ascend-llm/export_llama/export_llama.py ascend-llm-qwen/export_llama/export_llama.py +--- ascend-llm/export_llama/export_llama.py 2024-09-04 
19:49:56.471989100 +0800 ++++ ascend-llm-qwen/export_llama/export_llama.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,97 +0,0 @@ +-import argparse +-import importlib +-import torch +-import os +-from transformers import LlamaForCausalLM, LlamaTokenizer +- +- +-def export_onnx(base_model,out_path,quant_cfg_path,act_path): +- tokenizer= LlamaTokenizer.from_pretrained(base_model) +- model = LlamaForCausalLM.from_pretrained( +- base_model, +- torch_dtype=torch.float16, +- device_map="auto", +- ) +- model_cfg=model.model.config +- spec = importlib.util.spec_from_file_location("quant_cfg_module", quant_cfg_path) +- quant_cfg_module = importlib.util.module_from_spec(spec) +- spec.loader.exec_module(quant_cfg_module) +- quantize_cfg = quant_cfg_module.get(model_cfg,act_path) +- from quantize import quantize +- quantize(model,quantize_cfg) +- +- input_names = ["input_ids", "attention_mask", "position_ids","past_key_values"] +- output_names = ["logits","out_key_values","attn_scores"] +- dynamic_axes = { +- "input_ids": { 0: "batch_size", 1: "seq_length" }, +- "attention_mask": { 0: "batch_size",1:"all_len" }, +- "position_ids": { 0: "batch_size", 1: "seq_length" }, +- "past_key_values": { 2: "batch_size", 4: "kv_len" }, +- } +- +- batch_size,seq_len,kv_len=1,16,1024 +- all_len = seq_len + kv_len +- n_layers,n_heads,hidden_size=model_cfg.num_hidden_layers,model_cfg.num_key_value_heads,model_cfg.hidden_size +- head_dim = int(model_cfg.hidden_size / model_cfg.num_attention_heads) +- +- +- input_ids = torch.zeros((batch_size,seq_len)).long().to("cuda") # batch_size, new_sequence_length +- attention_mask = torch.zeros((batch_size,all_len)).long().to("cuda") # batch_size, all_sequence_length +- position_ids = torch.zeros((batch_size,seq_len)).long().to("cuda") # batch_size, new_sequence_length +- # past_keys = torch.rand((batch_size, n_heads,kv_len, head_dim),dtype=torch.float16).to("cuda") +- # past_values = torch.rand((batch_size,n_heads, kv_len, head_dim),dtype=torch.float16).to("cuda") +- # past_key_values = tuple([(past_keys,past_values)] * n_layers) +- past_key_values = torch.rand((n_layers,2,batch_size,n_heads, kv_len, head_dim),dtype=torch.float16).to("cuda") +- input_args = ( +- input_ids, +- attention_mask, +- position_ids, +- past_key_values, +- None, # inputs_embeds: Optional[torch.FloatTensor] = None, +- None, #labels: Optional[torch.LongTensor] = None, +- True, #use_cache: Optional[bool] = None, +- True # output_attentions: Optional[bool] = None, +- ) +- +- model.eval() +- torch.onnx.export( +- model, +- f=out_path, +- args=input_args, +- input_names=input_names, +- output_names=output_names, +- dynamic_axes=dynamic_axes, +- opset_version=13, +- export_params=True, +- ) +- +-if __name__ == "__main__": +- import os +- os.chdir(os.path.dirname(__file__)) +- parser = argparse.ArgumentParser() +- parser.add_argument( +- "--model", "-m", +- type=str, +- default="./model/TinyLlama-1.1B-Chat-v1.0", +- help="transformers model" +- ) +- parser.add_argument( +- "--output","-o", +- type=str, +- default="./model/export_out/tiny-llama.onnx", +- help="where to save onnx model", +- ) +- parser.add_argument( +- "--act-path","-a", +- type=str, +- default="./act_scales/llama-2-7b.pt", +- help="path to act_scales", +- ) +- parser.add_argument( +- "--quant","-q", +- type=str, +- default="./config/w8x8.py", +- help="path to quant config", +- ) +- args = parser.parse_args() +- export_onnx(args.model,args.output,args.quant,args.act_path) +diff -uNr ascend-llm/export_llama/generate_act_scales.py 
ascend-llm-qwen/export_llama/generate_act_scales.py +--- ascend-llm/export_llama/generate_act_scales.py 2024-09-04 19:21:03.076083700 +0800 ++++ ascend-llm-qwen/export_llama/generate_act_scales.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,109 +0,0 @@ +-''' +-code from https://github.com/mit-han-lab/smoothquant/ +-''' +-from datasets import load_dataset +-import functools +-from collections import defaultdict +- +-from functools import partial +-import numpy as np +-from tqdm import tqdm +-import torch +-import os +- +-from transformers import ( +- AutoModelForCausalLM, +- AutoTokenizer, +-) +-import argparse +- +-def get_act_scales(model, tokenizer, dataset_path, num_samples=512, seq_len=512): +- model.eval() +- device = next(model.parameters()).device +- act_scales = {} +- +- def stat_tensor(name, tensor): +- hidden_dim = tensor.shape[-1] +- tensor = tensor.view(-1, hidden_dim).abs().detach() +- comming_max = torch.max(tensor, dim=0)[0].float().cpu() +- if name in act_scales: +- act_scales[name] = torch.max(act_scales[name], comming_max) +- else: +- act_scales[name] = comming_max +- +- def stat_input_hook(m, x, y, name): +- if isinstance(x, tuple): +- x = x[0] +- stat_tensor(name, x) +- +- hooks = [] +- for name, m in model.named_modules(): +- if isinstance(m,torch.nn.Linear): +- hooks.append( +- m.register_forward_hook(functools.partial(stat_input_hook, name=name)) +- ) +- +- dataset = load_dataset("json", data_files=dataset_path,split="train") +- dataset = dataset.shuffle(seed=42) +- +- for i in tqdm(range(num_samples)): +- text = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format( +- instruction=dataset["instruction"][i], input=dataset["output"][i] +- ) +- input_ids = tokenizer( +- text, return_tensors="pt", max_length=seq_len, truncation=True +- ).input_ids.to(device) +- model(input_ids) +- +- for h in hooks: +- h.remove() +- +- return act_scales +- +- +- +-def build_model_and_tokenizer(model_name): +- tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512) +- kwargs = {"torch_dtype": torch.float16, "device_map": "sequential"} +- model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) +- return model, tokenizer +- +- +-def parse_args(): +- parser = argparse.ArgumentParser() +- parser.add_argument( +- "--model-name", type=str, default="/run/llama-chat-7b-hf", help="model name" +- ) +- parser.add_argument( +- "--output-path", +- type=str, +- default="act_scales/opt-1.3b.pt", +- help="where to save the act scales", +- ) +- parser.add_argument( +- "--dataset-path", +- type=str, +- default="/root/zanilia/alpaca-lora/alpaca_data.json", +- help="location of the calibration dataset, we use the validation set of the Pile dataset", +- ) +- parser.add_argument("--num-samples", type=int, default=512) +- parser.add_argument("--seq-len", type=int, default=512) +- args = parser.parse_args() +- return args +- +- +-@torch.no_grad() +-def main(): +- args = parse_args() +- model, tokenizer = build_model_and_tokenizer(args.model_name) +- +- act_scales = get_act_scales( +- model, tokenizer, args.dataset_path, args.num_samples, args.seq_len +- ) +- +- os.makedirs(os.path.dirname(args.output_path), exist_ok=True) +- torch.save(act_scales, args.output_path) +- +- +-if __name__ == "__main__": +- main() +diff -uNr ascend-llm/export_llama/modeling_llama_4.35.py 
ascend-llm-qwen/export_llama/modeling_llama_4.35.py +--- ascend-llm/export_llama/modeling_llama_4.35.py 2024-09-04 19:21:03.078081000 +0800 ++++ ascend-llm-qwen/export_llama/modeling_llama_4.35.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,1264 +0,0 @@ +-# coding=utf-8 +-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +-# +-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +-# and OPT implementations in this library. It has been modified from its +-# original forms to accommodate minor architectural differences compared +-# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-""" PyTorch LLaMA model.""" +-import math +-import warnings +-from typing import List, Optional, Tuple, Union +- +-import torch +-import torch.nn.functional as F +-import torch.utils.checkpoint +-from torch import nn +-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +- +-from ...activations import ACT2FN +-from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask +-from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +-from ...modeling_utils import PreTrainedModel +-from ...pytorch_utils import ALL_LAYERNORM_LAYERS +-from ...utils import ( +- add_start_docstrings, +- add_start_docstrings_to_model_forward, +- is_flash_attn_2_available, +- logging, +- replace_return_docstrings, +-) +-from ...utils.import_utils import is_torch_fx_available +-from .configuration_llama import LlamaConfig +- +- +-if is_flash_attn_2_available(): +- from flash_attn import flash_attn_func, flash_attn_varlen_func +- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +- +- +-# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +-# It means that the function will not be traced through and simply appear as a node in the graph. +-if is_torch_fx_available(): +- _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) +- +- +-logger = logging.get_logger(__name__) +- +-_CONFIG_FOR_DOC = "LlamaConfig" +- +- +-def _get_unpad_data(attention_mask): +- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) +- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() +- max_seqlen_in_batch = seqlens_in_batch.max().item() +- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) +- return ( +- indices, +- cu_seqlens, +- max_seqlen_in_batch, +- ) +- +- +-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): +- warnings.warn( +- "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. 
Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask" +- ) +- return AttentionMaskConverter._prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) +- +- +-def _make_causal_mask( +- input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +-): +- warnings.warn( +- "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask" +- ) +- return AttentionMaskConverter._make_causal_mask( +- input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length +- ) +- +- +-class LlamaRMSNorm(nn.Module): +- def __init__(self, hidden_size, eps=1e-6): +- """ +- LlamaRMSNorm is equivalent to T5LayerNorm +- """ +- super().__init__() +- self.weight = nn.Parameter(torch.ones(hidden_size)) +- self.variance_epsilon = eps +- +- def forward(self, hidden_states): +- input_dtype = hidden_states.dtype +- hidden_states = hidden_states.to(torch.float32) +- variance = hidden_states.pow(2).mean(-1, keepdim=True) +- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) +- return self.weight * hidden_states.to(input_dtype) +- +- +-ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) +- +- +-class LlamaRotaryEmbedding(nn.Module): +- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): +- super().__init__() +- +- self.dim = dim +- self.max_position_embeddings = max_position_embeddings +- self.base = base +- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) +- self.register_buffer("inv_freq", inv_freq, persistent=False) +- +- # Build here to make `torch.jit.trace` work. +- self._set_cos_sin_cache( +- seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() +- ) +- +- def _set_cos_sin_cache(self, seq_len, device, dtype): +- self.max_seq_len_cached = seq_len +- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) +- +- freqs = torch.einsum("i,j->ij", t, self.inv_freq) +- # Different from paper, but it uses a different permutation in order to obtain the same calculation +- emb = torch.cat((freqs, freqs), dim=-1) +- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) +- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) +- +- def forward(self, x, seq_len=None): +- return ( +- self.cos_cached.to(dtype=x.dtype), +- self.sin_cached.to(dtype=x.dtype), +- ) +- # x: [bs, num_attention_heads, seq_len, head_size] +- if seq_len > self.max_seq_len_cached: +- self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) +- +- return ( +- self.cos_cached[:seq_len].to(dtype=x.dtype), +- self.sin_cached[:seq_len].to(dtype=x.dtype), +- ) +- +- +-class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): +- """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" +- +- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): +- self.scaling_factor = scaling_factor +- super().__init__(dim, max_position_embeddings, base, device) +- +- def _set_cos_sin_cache(self, seq_len, device, dtype): +- self.max_seq_len_cached = seq_len +- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) +- t = t / self.scaling_factor +- +- freqs = torch.einsum("i,j->ij", t, self.inv_freq) +- # Different from paper, but it uses a different permutation in order to obtain the same calculation +- emb = torch.cat((freqs, freqs), dim=-1) +- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) +- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) +- +- +-class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): +- """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" +- +- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): +- self.scaling_factor = scaling_factor +- super().__init__(dim, max_position_embeddings, base, device) +- +- def _set_cos_sin_cache(self, seq_len, device, dtype): +- self.max_seq_len_cached = seq_len +- +- if seq_len > self.max_position_embeddings: +- base = self.base * ( +- (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) +- ) ** (self.dim / (self.dim - 2)) +- inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) +- self.register_buffer("inv_freq", inv_freq, persistent=False) +- +- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) +- +- freqs = torch.einsum("i,j->ij", t, self.inv_freq) +- # Different from paper, but it uses a different permutation in order to obtain the same calculation +- emb = torch.cat((freqs, freqs), dim=-1) +- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) +- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) +- +- +-def rotate_half(x): +- """Rotates half the hidden dims of the input.""" +- x1 = x[..., : x.shape[-1] // 2] +- x2 = x[..., x.shape[-1] // 2 :] +- return torch.cat((-x2, x1), dim=-1) +- +- +-def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): +- """Applies Rotary Position Embedding to the query and key tensors. +- +- Args: +- q (`torch.Tensor`): The query tensor. +- k (`torch.Tensor`): The key tensor. +- cos (`torch.Tensor`): The cosine part of the rotary embedding. +- sin (`torch.Tensor`): The sine part of the rotary embedding. +- position_ids (`torch.Tensor`): +- The position indices of the tokens corresponding to the query and key tensors. For example, this can be +- used to pass offsetted position ids when working with a KV-cache. +- unsqueeze_dim (`int`, *optional*, defaults to 1): +- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and +- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note +- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and +- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes +- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have +- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. +- Returns: +- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. +- """ +- cos = cos[position_ids].unsqueeze(unsqueeze_dim) +- sin = sin[position_ids].unsqueeze(unsqueeze_dim) +- q_embed = (q * cos) + (rotate_half(q) * sin) +- k_embed = (k * cos) + (rotate_half(k) * sin) +- return q_embed, k_embed +- +- +-class LlamaMLP(nn.Module): +- def __init__(self, config): +- super().__init__() +- self.config = config +- self.hidden_size = config.hidden_size +- self.intermediate_size = config.intermediate_size +- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) +- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) +- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) +- self.act_fn = ACT2FN[config.hidden_act] +- +- def forward(self, x): +- if self.config.pretraining_tp > 1: +- slice = self.intermediate_size // self.config.pretraining_tp +- gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) +- up_proj_slices = self.up_proj.weight.split(slice, dim=0) +- down_proj_slices = self.down_proj.weight.split(slice, dim=1) +- +- gate_proj = torch.cat( +- [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 +- ) +- up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) +- +- intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) +- down_proj = [ +- F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) +- ] +- down_proj = sum(down_proj) +- else: +- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) +- +- return down_proj +- +- +-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: +- """ +- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, +- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) +- """ +- batch, num_key_value_heads, slen, head_dim = hidden_states.shape +- if n_rep == 1: +- return hidden_states +- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) +- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) +- +- +-class LlamaAttention(nn.Module): +- """Multi-headed attention from 'Attention Is All You Need' paper""" +- +- def __init__(self, config: LlamaConfig): +- super().__init__() +- self.config = config +- self.hidden_size = config.hidden_size +- self.num_heads = config.num_attention_heads +- self.head_dim = self.hidden_size // self.num_heads +- self.num_key_value_heads = config.num_key_value_heads +- self.num_key_value_groups = self.num_heads // self.num_key_value_heads +- self.max_position_embeddings = config.max_position_embeddings +- self.rope_theta = config.rope_theta +- self.is_causal = True +- +- if (self.head_dim * self.num_heads) != self.hidden_size: +- raise ValueError( +- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" +- f" and `num_heads`: {self.num_heads})." 
+- ) +- self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) +- self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) +- self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) +- self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) +- self._init_rope() +- +- def _init_rope(self): +- if self.config.rope_scaling is None: +- self.rotary_emb = LlamaRotaryEmbedding( +- self.head_dim, +- max_position_embeddings=self.max_position_embeddings, +- base=self.rope_theta, +- ) +- else: +- scaling_type = self.config.rope_scaling["type"] +- scaling_factor = self.config.rope_scaling["factor"] +- if scaling_type == "linear": +- self.rotary_emb = LlamaLinearScalingRotaryEmbedding( +- self.head_dim, +- max_position_embeddings=self.max_position_embeddings, +- scaling_factor=scaling_factor, +- base=self.rope_theta, +- ) +- elif scaling_type == "dynamic": +- self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( +- self.head_dim, +- max_position_embeddings=self.max_position_embeddings, +- scaling_factor=scaling_factor, +- base=self.rope_theta, +- ) +- else: +- raise ValueError(f"Unknown RoPE scaling type {scaling_type}") +- +- def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): +- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() +- +- def forward( +- self, +- hidden_states: torch.Tensor, +- attention_mask: Optional[torch.Tensor] = None, +- position_ids: Optional[torch.LongTensor] = None, +- past_key_value: Optional[Tuple[torch.Tensor]] = None, +- output_attentions: bool = False, +- use_cache: bool = False, +- **kwargs, +- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: +- if "padding_mask" in kwargs: +- warnings.warn( +- "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" +- ) +- +- bsz, q_len, _ = hidden_states.size() +- +- if self.config.pretraining_tp > 1: +- key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp +- query_slices = self.q_proj.weight.split( +- (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 +- ) +- key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) +- value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) +- +- query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] +- query_states = torch.cat(query_states, dim=-1) +- +- key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] +- key_states = torch.cat(key_states, dim=-1) +- +- value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] +- value_states = torch.cat(value_states, dim=-1) +- +- else: +- query_states = self.q_proj(hidden_states) +- key_states = self.k_proj(hidden_states) +- value_states = self.v_proj(hidden_states) +- +- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) +- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) +- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) +- +- kv_seq_len = key_states.shape[-2] +- if past_key_value is not None: +- kv_seq_len += past_key_value[0].shape[-2] +- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) +- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) +- +- out_key_value = (key_states, value_states) if use_cache else None +- +- if past_key_value is not None: +- # reuse k, v, self_attention +- key_states = torch.cat([past_key_value[0], key_states], dim=2) +- value_states = torch.cat([past_key_value[1], value_states], dim=2) +- +- +- key_states = repeat_kv(key_states, self.num_key_value_groups) +- value_states = repeat_kv(value_states, self.num_key_value_groups) +- +- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) +- +- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): +- raise ValueError( +- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" +- f" {attn_weights.size()}" +- ) +- +- if attention_mask is not None: +- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): +- raise ValueError( +- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" +- ) +- attn_weights = attn_weights + attention_mask +- +- # upcast attention to fp32 +- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) +- attn_output = torch.matmul(attn_weights, value_states) +- +- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): +- raise ValueError( +- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" +- f" {attn_output.size()}" +- ) +- +- attn_output = attn_output.transpose(1, 2).contiguous() +- +- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) +- +- if self.config.pretraining_tp > 1: +- attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) +- o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) +- attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i 
in range(self.config.pretraining_tp)]) +- else: +- attn_output = self.o_proj(attn_output) +- +- if not output_attentions: +- attn_weights = None +- +- return attn_output, attn_weights, out_key_value +- +- +-class LlamaFlashAttention2(LlamaAttention): +- """ +- Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays +- untouched. The only required change would be on the forward pass where it needs to correctly call the public API of +- flash attention and deal with padding tokens in case the input contains any of them. +- """ +- +- def forward( +- self, +- hidden_states: torch.Tensor, +- attention_mask: Optional[torch.LongTensor] = None, +- position_ids: Optional[torch.LongTensor] = None, +- past_key_value: Optional[Tuple[torch.Tensor]] = None, +- output_attentions: bool = False, +- use_cache: bool = False, +- **kwargs, +- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: +- # LlamaFlashAttention2 attention does not support output_attentions +- if "padding_mask" in kwargs: +- warnings.warn( +- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" +- ) +- +- # overwrite attention_mask with padding_mask +- attention_mask = kwargs.pop("padding_mask") +- +- output_attentions = False +- +- bsz, q_len, _ = hidden_states.size() +- +- query_states = self.q_proj(hidden_states) +- key_states = self.k_proj(hidden_states) +- value_states = self.v_proj(hidden_states) +- +- # Flash attention requires the input to have the shape +- # batch_size x seq_length x head_dim x hidden_dim +- # therefore we just need to keep the original shape +- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) +- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) +- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) +- +- kv_seq_len = key_states.shape[-2] +- if past_key_value is not None: +- kv_seq_len += past_key_value[0].shape[-2] +- +- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) +- +- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) +- +- if past_key_value is not None: +- # reuse k, v, self_attention +- key_states = torch.cat([past_key_value[0], key_states], dim=2) +- value_states = torch.cat([past_key_value[1], value_states], dim=2) +- +- past_key_value = (key_states, value_states) if use_cache else None +- +- query_states = query_states.transpose(1, 2) +- key_states = key_states.transpose(1, 2) +- value_states = value_states.transpose(1, 2) +- +- # TODO: llama does not have dropout in the config?? +- # It is recommended to use dropout with FA according to the docs +- # when training. +- dropout_rate = 0.0 # if not self.training else self.attn_dropout +- +- # In PEFT, usually we cast the layer norms in float32 for training stability reasons +- # therefore the input hidden states gets silently casted in float32. Hence, we need +- # cast them back in the correct dtype just to be sure everything works as expected. +- # This might slowdown training & inference so it is recommended to not cast the LayerNorms +- # in fp32. 
(LlamaRMSNorm handles it correctly) +- +- input_dtype = query_states.dtype +- if input_dtype == torch.float32: +- # Handle the case where the model is quantized +- if hasattr(self.config, "_pre_quantization_dtype"): +- target_dtype = self.config._pre_quantization_dtype +- else: +- target_dtype = self.q_proj.weight.dtype +- +- logger.warning_once( +- f"The input hidden states seems to be silently casted in float32, this might be related to" +- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" +- f" {target_dtype}." +- ) +- +- query_states = query_states.to(target_dtype) +- key_states = key_states.to(target_dtype) +- value_states = value_states.to(target_dtype) +- +- attn_output = self._flash_attention_forward( +- query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate +- ) +- +- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() +- attn_output = self.o_proj(attn_output) +- +- if not output_attentions: +- attn_weights = None +- +- return attn_output, attn_weights, past_key_value +- +- def _flash_attention_forward( +- self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None +- ): +- """ +- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token +- first unpad the input, then computes the attention scores and pad the final attention scores. +- +- Args: +- query_states (`torch.Tensor`): +- Input query states to be passed to Flash Attention API +- key_states (`torch.Tensor`): +- Input key states to be passed to Flash Attention API +- value_states (`torch.Tensor`): +- Input value states to be passed to Flash Attention API +- attention_mask (`torch.Tensor`): +- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the +- position of padding tokens and 1 for the position of non-padding tokens. +- dropout (`int`, *optional*): +- Attention dropout +- softmax_scale (`float`, *optional*): +- The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) +- """ +- # Contains at least one padding token in the sequence +- if attention_mask is not None: +- batch_size = query_states.shape[0] +- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( +- query_states, key_states, value_states, attention_mask, query_length +- ) +- +- cu_seqlens_q, cu_seqlens_k = cu_seq_lens +- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens +- +- attn_output_unpad = flash_attn_varlen_func( +- query_states, +- key_states, +- value_states, +- cu_seqlens_q=cu_seqlens_q, +- cu_seqlens_k=cu_seqlens_k, +- max_seqlen_q=max_seqlen_in_batch_q, +- max_seqlen_k=max_seqlen_in_batch_k, +- dropout_p=dropout, +- softmax_scale=softmax_scale, +- causal=self.is_causal, +- ) +- +- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) +- else: +- attn_output = flash_attn_func( +- query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal +- ) +- +- return attn_output +- +- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): +- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) +- batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape +- +- key_layer = index_first_axis( +- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k +- ) +- value_layer = index_first_axis( +- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k +- ) +- if query_length == kv_seq_len: +- query_layer = index_first_axis( +- query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k +- ) +- cu_seqlens_q = cu_seqlens_k +- max_seqlen_in_batch_q = max_seqlen_in_batch_k +- indices_q = indices_k +- elif query_length == 1: +- max_seqlen_in_batch_q = 1 +- cu_seqlens_q = torch.arange( +- batch_size + 1, dtype=torch.int32, device=query_layer.device +- ) # There is a memcpy here, that is very bad. +- indices_q = cu_seqlens_q[:-1] +- query_layer = query_layer.squeeze(1) +- else: +- # The -q_len: slice assumes left padding. 
+- attention_mask = attention_mask[:, -query_length:] +- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) +- +- return ( +- query_layer, +- key_layer, +- value_layer, +- indices_q, +- (cu_seqlens_q, cu_seqlens_k), +- (max_seqlen_in_batch_q, max_seqlen_in_batch_k), +- ) +- +- +-class LlamaDecoderLayer(nn.Module): +- def __init__(self, config: LlamaConfig): +- super().__init__() +- self.hidden_size = config.hidden_size +- self.self_attn = ( +- LlamaAttention(config=config) +- if not getattr(config, "_flash_attn_2_enabled", False) +- else LlamaFlashAttention2(config=config) +- ) +- self.mlp = LlamaMLP(config) +- self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) +- self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) +- +- def forward( +- self, +- hidden_states: torch.Tensor, +- attention_mask: Optional[torch.Tensor] = None, +- position_ids: Optional[torch.LongTensor] = None, +- past_key_value: Optional[Tuple[torch.Tensor]] = None, +- output_attentions: Optional[bool] = False, +- use_cache: Optional[bool] = False, +- **kwargs, +- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: +- """ +- Args: +- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` +- attention_mask (`torch.FloatTensor`, *optional*): +- attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, +- query_sequence_length, key_sequence_length)` if default attention is used. +- output_attentions (`bool`, *optional*): +- Whether or not to return the attentions tensors of all attention layers. See `attentions` under +- returned tensors for more detail. +- use_cache (`bool`, *optional*): +- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding +- (see `past_key_values`). +- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states +- """ +- if "padding_mask" in kwargs: +- warnings.warn( +- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" +- ) +- +- residual = hidden_states +- +- hidden_states = self.input_layernorm(hidden_states) +- +- # Self Attention +- hidden_states, self_attn_weights, present_key_value = self.self_attn( +- hidden_states=hidden_states, +- attention_mask=attention_mask, +- position_ids=position_ids, +- past_key_value=past_key_value, +- output_attentions=output_attentions, +- use_cache=use_cache, +- **kwargs, +- ) +- hidden_states = residual + hidden_states +- +- # Fully Connected +- residual = hidden_states +- hidden_states = self.post_attention_layernorm(hidden_states) +- hidden_states = self.mlp(hidden_states) +- hidden_states = residual + hidden_states +- +- outputs = (hidden_states,) +- +- if output_attentions: +- outputs += (self_attn_weights,) +- +- if use_cache: +- outputs += (present_key_value,) +- +- return outputs +- +- +-LLAMA_START_DOCSTRING = r""" +- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the +- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads +- etc.) +- +- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage +- and behavior. +- +- Parameters: +- config ([`LlamaConfig`]): +- Model configuration class with all the parameters of the model. Initializing with a config file does not +- load the weights associated with the model, only the configuration. Check out the +- [`~PreTrainedModel.from_pretrained`] method to load the model weights. +-""" +- +- +-@add_start_docstrings( +- "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", +- LLAMA_START_DOCSTRING, +-) +-class LlamaPreTrainedModel(PreTrainedModel): +- config_class = LlamaConfig +- base_model_prefix = "model" +- supports_gradient_checkpointing = True +- _no_split_modules = ["LlamaDecoderLayer"] +- _skip_keys_device_placement = "past_key_values" +- _supports_flash_attn_2 = True +- +- def _init_weights(self, module): +- std = self.config.initializer_range +- if isinstance(module, nn.Linear): +- module.weight.data.normal_(mean=0.0, std=std) +- if module.bias is not None: +- module.bias.data.zero_() +- elif isinstance(module, nn.Embedding): +- module.weight.data.normal_(mean=0.0, std=std) +- if module.padding_idx is not None: +- module.weight.data[module.padding_idx].zero_() +- +- +-LLAMA_INPUTS_DOCSTRING = r""" +- Args: +- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): +- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide +- it. +- +- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and +- [`PreTrainedTokenizer.__call__`] for details. +- +- [What are input IDs?](../glossary#input-ids) +- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): +- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: +- +- - 1 for tokens that are **not masked**, +- - 0 for tokens that are **masked**. +- +- [What are attention masks?](../glossary#attention-mask) +- +- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and +- [`PreTrainedTokenizer.__call__`] for details. +- +- If `past_key_values` is used, optionally only the last `input_ids` have to be input (see +- `past_key_values`). +- +- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] +- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more +- information on the default strategy. +- +- - 1 indicates the head is **not masked**, +- - 0 indicates the head is **masked**. +- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): +- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, +- config.n_positions - 1]`. +- +- [What are position IDs?](../glossary#position-ids) +- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): +- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape +- `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape +- `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
+- +- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention +- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. +- +- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't +- have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` +- of shape `(batch_size, sequence_length)`. +- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): +- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This +- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the +- model's internal embedding lookup matrix. +- use_cache (`bool`, *optional*): +- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see +- `past_key_values`). +- output_attentions (`bool`, *optional*): +- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned +- tensors for more detail. +- output_hidden_states (`bool`, *optional*): +- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for +- more detail. +- return_dict (`bool`, *optional*): +- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +-""" +- +- +-@add_start_docstrings( +- "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", +- LLAMA_START_DOCSTRING, +-) +-class LlamaModel(LlamaPreTrainedModel): +- """ +- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] +- +- Args: +- config: LlamaConfig +- """ +- +- def __init__(self, config: LlamaConfig): +- super().__init__(config) +- self.padding_idx = config.pad_token_id +- self.vocab_size = config.vocab_size +- +- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) +- self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) +- self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) +- +- self.gradient_checkpointing = False +- # Initialize weights and apply final processing +- self.post_init() +- +- def get_input_embeddings(self): +- return self.embed_tokens +- +- def set_input_embeddings(self, value): +- self.embed_tokens = value +- +- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) +- def forward( +- self, +- input_ids: torch.LongTensor = None, +- attention_mask: Optional[torch.Tensor] = None, +- position_ids: Optional[torch.LongTensor] = None, +- past_key_values: Optional[List[torch.FloatTensor]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> Union[Tuple, BaseModelOutputWithPast]: +- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions +- output_hidden_states = ( +- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states +- ) +- use_cache = use_cache if use_cache is not None else self.config.use_cache +- +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # retrieve input_ids and inputs_embeds +- if input_ids 
is not None and inputs_embeds is not None: +- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") +- elif input_ids is not None: +- batch_size, seq_length = input_ids.shape[:2] +- elif inputs_embeds is not None: +- batch_size, seq_length = inputs_embeds.shape[:2] +- else: +- raise ValueError("You have to specify either input_ids or inputs_embeds") +- +- past_key_values_length = 0 +- if past_key_values is not None: +- past_key_values_length = past_key_values[0][0].shape[2] +- +- # new_key_values_shape=past_key_values.shape +- # new_key_values_shape[-2]=seq_length +- # next_decoder_cache=torch.empty(new_key_values_shape,dtype=past_key_values[0][0].dtype) +- +- +- if position_ids is None: +- device = input_ids.device if input_ids is not None else inputs_embeds.device +- position_ids = torch.arange( +- past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device +- ) +- position_ids = position_ids.unsqueeze(0) +- +- if inputs_embeds is None: +- inputs_embeds = self.embed_tokens(input_ids) +- +- if getattr(self.config, "_flash_attn_2_enabled", False): +- # 2d mask is passed through the layers +- attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None +- else: +- # 4d mask is passed through the layers +- attention_mask = _prepare_4d_causal_attention_mask( +- attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length +- ) +- +- # embed positions +- hidden_states = inputs_embeds +- +- if self.gradient_checkpointing and self.training: +- if use_cache: +- logger.warning_once( +- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." +- ) +- use_cache = False +- +- # decoder layers +- all_hidden_states = () if output_hidden_states else None +- all_self_attns = () if output_attentions else None +- next_decoder_cache = [] if use_cache else None +- +- for idx, decoder_layer in enumerate(self.layers): +- if output_hidden_states: +- all_hidden_states += (hidden_states,) +- +- past_key_value = past_key_values[idx] if past_key_values is not None else None +- +- if self.gradient_checkpointing and self.training: +- layer_outputs = self._gradient_checkpointing_func( +- decoder_layer.__call__, +- hidden_states, +- attention_mask, +- position_ids, +- past_key_value, +- output_attentions, +- use_cache, +- ) +- else: +- layer_outputs = decoder_layer( +- hidden_states, +- attention_mask=attention_mask, +- position_ids=position_ids, +- past_key_value=past_key_value, +- output_attentions=output_attentions, +- use_cache=use_cache, +- ) +- +- hidden_states = layer_outputs[0] +- +- if use_cache: +- key_values= layer_outputs[2 if output_attentions else 1] +- # next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) +- # next_decoder_cache[idx][0] = key_values[0] +- # next_decoder_cache[idx][1] = key_values[1] +- next_decoder_cache.extend(layer_outputs[2 if output_attentions else 1]) +- +- if output_attentions: +- all_self_attns += (layer_outputs[1],) +- +- hidden_states = self.norm(hidden_states) +- +- # add hidden states from the last decoder layer +- if output_hidden_states: +- all_hidden_states += (hidden_states,) +- +- next_cache = torch.concat(next_decoder_cache).reshape(len(self.layers),2,*next_decoder_cache[0].shape) if use_cache else None +- if output_attentions: +- all_self_attns = torch.concat(all_self_attns).reshape(len(self.layers),*all_self_attns[0].shape) +- if not return_dict: +- return tuple(v for v in [hidden_states, 
next_cache, all_hidden_states, all_self_attns] if v is not None) +- return BaseModelOutputWithPast( +- last_hidden_state=hidden_states, +- past_key_values=next_cache, +- hidden_states=all_hidden_states, +- attentions=all_self_attns, +- ) +- +- +-class LlamaForCausalLM(LlamaPreTrainedModel): +- _tied_weights_keys = ["lm_head.weight"] +- +- def __init__(self, config): +- super().__init__(config) +- self.model = LlamaModel(config) +- self.vocab_size = config.vocab_size +- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) +- +- # Initialize weights and apply final processing +- self.post_init() +- +- def get_input_embeddings(self): +- return self.model.embed_tokens +- +- def set_input_embeddings(self, value): +- self.model.embed_tokens = value +- +- def get_output_embeddings(self): +- return self.lm_head +- +- def set_output_embeddings(self, new_embeddings): +- self.lm_head = new_embeddings +- +- def set_decoder(self, decoder): +- self.model = decoder +- +- def get_decoder(self): +- return self.model +- +- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) +- @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) +- def forward( +- self, +- input_ids: torch.LongTensor = None, +- attention_mask: Optional[torch.Tensor] = None, +- position_ids: Optional[torch.LongTensor] = None, +- past_key_values: Optional[List[torch.FloatTensor]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> Union[Tuple, CausalLMOutputWithPast]: +- r""" +- Args: +- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): +- Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., +- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored +- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. +- +- Returns: +- +- Example: +- +- ```python +- >>> from transformers import AutoTokenizer, LlamaForCausalLM +- +- >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) +- >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) +- +- >>> prompt = "Hey, are you conscious? Can you talk to me?" +- >>> inputs = tokenizer(prompt, return_tensors="pt") +- +- >>> # Generate +- >>> generate_ids = model.generate(inputs.input_ids, max_length=30) +- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+- ```""" +- +- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions +- output_hidden_states = ( +- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states +- ) +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) +- outputs = self.model( +- input_ids=input_ids, +- attention_mask=attention_mask, +- position_ids=position_ids, +- past_key_values=past_key_values, +- inputs_embeds=inputs_embeds, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- hidden_states = outputs[0] +- if self.config.pretraining_tp > 1: +- lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) +- logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] +- logits = torch.cat(logits, dim=-1) +- else: +- logits = self.lm_head(hidden_states) +- logits = logits.float() +- +- loss = None +- if labels is not None: +- # Shift so that tokens < n predict n +- shift_logits = logits[..., :-1, :].contiguous() +- shift_labels = labels[..., 1:].contiguous() +- # Flatten the tokens +- loss_fct = CrossEntropyLoss() +- shift_logits = shift_logits.view(-1, self.config.vocab_size) +- shift_labels = shift_labels.view(-1) +- # Enable model parallelism +- shift_labels = shift_labels.to(shift_logits.device) +- loss = loss_fct(shift_logits, shift_labels) +- +- if not return_dict: +- output = (logits,) + outputs[1:] +- return (loss,) + output if loss is not None else output +- +- return CausalLMOutputWithPast( +- loss=loss, +- logits=logits, +- past_key_values=outputs.past_key_values, +- hidden_states=outputs.hidden_states, +- attentions=outputs.attentions, +- ) +- +- def prepare_inputs_for_generation( +- self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs +- ): +- if past_key_values is not None: +- past_length = past_key_values[0][0].shape[2] +- +- # Some generation methods already pass only the last input ID +- if input_ids.shape[1] > past_length: +- remove_prefix_length = past_length +- else: +- # Default to old behavior: keep only final ID +- remove_prefix_length = input_ids.shape[1] - 1 +- +- input_ids = input_ids[:, remove_prefix_length:] +- +- position_ids = kwargs.get("position_ids", None) +- if attention_mask is not None and position_ids is None: +- # create position_ids on the fly for batch generation +- position_ids = attention_mask.long().cumsum(-1) - 1 +- position_ids.masked_fill_(attention_mask == 0, 1) +- if past_key_values: +- position_ids = position_ids[:, -input_ids.shape[1] :] +- +- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step +- if inputs_embeds is not None and past_key_values is None: +- model_inputs = {"inputs_embeds": inputs_embeds} +- else: +- model_inputs = {"input_ids": input_ids} +- +- model_inputs.update( +- { +- "position_ids": position_ids, +- "past_key_values": past_key_values, +- "use_cache": kwargs.get("use_cache"), +- "attention_mask": attention_mask, +- } +- ) +- return model_inputs +- +- @staticmethod +- def _reorder_cache(past_key_values, beam_idx): +- reordered_past = () +- for layer_past in past_key_values: +- reordered_past += ( +- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), +- ) +- return 
reordered_past +- +- +-@add_start_docstrings( +- """ +- The LLaMa Model transformer with a sequence classification head on top (linear layer). +- +- [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models +- (e.g. GPT-2) do. +- +- Since it does classification on the last token, it requires to know the position of the last token. If a +- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If +- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the +- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in +- each row of the batch). +- """, +- LLAMA_START_DOCSTRING, +-) +-class LlamaForSequenceClassification(LlamaPreTrainedModel): +- def __init__(self, config): +- super().__init__(config) +- self.num_labels = config.num_labels +- self.model = LlamaModel(config) +- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) +- +- # Initialize weights and apply final processing +- self.post_init() +- +- def get_input_embeddings(self): +- return self.model.embed_tokens +- +- def set_input_embeddings(self, value): +- self.model.embed_tokens = value +- +- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) +- def forward( +- self, +- input_ids: torch.LongTensor = None, +- attention_mask: Optional[torch.Tensor] = None, +- position_ids: Optional[torch.LongTensor] = None, +- past_key_values: Optional[List[torch.FloatTensor]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> Union[Tuple, SequenceClassifierOutputWithPast]: +- r""" +- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): +- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., +- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If +- `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+- """ +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- transformer_outputs = self.model( +- input_ids, +- attention_mask=attention_mask, +- position_ids=position_ids, +- past_key_values=past_key_values, +- inputs_embeds=inputs_embeds, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- hidden_states = transformer_outputs[0] +- logits = self.score(hidden_states) +- +- if input_ids is not None: +- batch_size = input_ids.shape[0] +- else: +- batch_size = inputs_embeds.shape[0] +- +- if self.config.pad_token_id is None and batch_size != 1: +- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") +- if self.config.pad_token_id is None: +- sequence_lengths = -1 +- else: +- if input_ids is not None: +- sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( +- logits.device +- ) +- else: +- sequence_lengths = -1 +- +- pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] +- +- loss = None +- if labels is not None: +- labels = labels.to(logits.device) +- if self.config.problem_type is None: +- if self.num_labels == 1: +- self.config.problem_type = "regression" +- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): +- self.config.problem_type = "single_label_classification" +- else: +- self.config.problem_type = "multi_label_classification" +- +- if self.config.problem_type == "regression": +- loss_fct = MSELoss() +- if self.num_labels == 1: +- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) +- else: +- loss = loss_fct(pooled_logits, labels) +- elif self.config.problem_type == "single_label_classification": +- loss_fct = CrossEntropyLoss() +- loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) +- elif self.config.problem_type == "multi_label_classification": +- loss_fct = BCEWithLogitsLoss() +- loss = loss_fct(pooled_logits, labels) +- if not return_dict: +- output = (pooled_logits,) + transformer_outputs[1:] +- return ((loss,) + output) if loss is not None else output +- +- return SequenceClassifierOutputWithPast( +- loss=loss, +- logits=pooled_logits, +- past_key_values=transformer_outputs.past_key_values, +- hidden_states=transformer_outputs.hidden_states, +- attentions=transformer_outputs.attentions, +- ) +diff -uNr ascend-llm/export_llama/quantize.py ascend-llm-qwen/export_llama/quantize.py +--- ascend-llm/export_llama/quantize.py 2024-09-04 19:21:03.078081000 +0800 ++++ ascend-llm-qwen/export_llama/quantize.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,183 +0,0 @@ +-import torch +-from torch import nn,Tensor +-from typing import Optional,List,Tuple +-from torch.onnx.symbolic_helper import parse_args +- +-class MatMulInteger(torch.autograd.Function): +- @staticmethod +- def forward(ctx, x:torch.Tensor,weight_t:torch.Tensor): +- res = torch.matmul(x.to(dtype=torch.float32),weight_t.to(torch.float32)) +- # res=torch.matmul(x.to(dtype=torch.int32,device="cpu") , # torch不支持CUDA上的int8矩阵乘 +- # weight_t.to(dtype=torch.int32,device="cpu")).to(x.device) +- return res +- +- @staticmethod +- @parse_args("v","v") +- def symbolic(g:torch._C.Graph, x:torch.Tensor,weight_t:torch.Tensor): +- return g.op("MatMulInteger", x,weight_t) +- +-matmulInteger = MatMulInteger.apply +- +-def quantize_mat(mat:Tensor)-> Tuple[Tensor,Tensor]: +- # max_val = torch.max(torch.abs(mat),dim=-1)[0] +- # mat = (mat * (127 / 
max_val)[...,None]).to(dtype=torch.int8) +- max_val = (torch.max(torch.abs(mat),dim=-1)[0] / 127.0).to(dtype=mat.dtype) +- mat = (mat / max_val[...,None]).to(dtype=torch.int8) +- return mat, max_val +- +-def dequantize_mat(mat:Tensor,max_val:Tensor): +- return torch.mul(mat,max_val.unsqueeze(-1)) +- +-def decomposition(mat:Tensor,unq_idx:Tensor,t:Tensor) -> Tuple[Tensor,Tensor,Tensor,Tensor]: +- return mat.mul(t.to(dtype=mat.dtype)),mat[...,unq_idx] +- mat=mat.clone() +- mat_unq = mat[...,unq_idx] +- if mat.dim() == 3: +- mat[:,:,unq_idx] = 0 +- elif mat.dim() == 4: +- mat[:,:,:,unq_idx] = 0 +- elif mat.dim() == 2: +- mat[:,unq_idx] = 0 +- return mat,mat_unq +- +-def get_unq_idx_topk(mat:Tensor,k:int=64): +- idx=torch.topk(mat.view(-1,mat.shape[-1]).abs().max(dim=-2)[0],k,dim=-1)[1] +- t = torch.ones((mat.shape[-1]),dtype=mat.dtype,device=mat.device) +- t = t.clone() +- t[idx] = 0 +- return idx,t +- +-def get_unq_idx_thres(mat:Tensor,threshold:float=6.0): +- k = mat.view(-1,mat.shape[-1]).abs().max(dim=-2)[0] >= threshold +- return k.nonzero().view(-1), k +- +-def qMatmul(x_q:Tensor,x_max:Tensor,weight_q:Tensor,w_max:Tensor,dtype): +- res_q = matmulInteger(x_q , weight_q) +- mx = nn.functional.linear(x_max.unsqueeze(-1),w_max.unsqueeze(-1)) +- res = torch.mul(res_q.to(device=mx.device,dtype=torch.float32), mx.to(torch.float32) ).to(dtype=dtype) +- # res = torch.mul((res_q.to(device=mx.device,dtype=torch.float32) / (127.0*127.0)).to(torch.float16), mx ) +- return res +- +-class W8Linear(nn.Module): +- def __init__(self, origin_weight:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): +- super().__init__() +- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) +- self.dtype = origin_weight.dtype +- self.alpha = alpha +- self.weight_q,self.max_val = quantize_mat(origin_weight.detach()) +- self.weight_q = nn.Parameter(self.weight_q,requires_grad=False) +- self.max_val = nn.Parameter(self.max_val,requires_grad=False) +- +- def forward(self,x:Tensor) -> Tensor: +- return nn.functional.linear(x,dequantize_mat(self.weight_q,self.max_val),bias=self.bias) +- +-# act_max for smooth +-class W8X8Linear(nn.Module): +- def __init__(self, ori_w:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): +- super().__init__() +- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) +- self.dtype = ori_w.dtype +- self.alpha = alpha +- self.scales = None +- if act_max is not None: +- act_max = act_max.to(ori_w.device) +- self.scales = (act_max.pow(alpha) / ori_w.abs().max(dim=0)[0].pow(1 - alpha)).clamp(min=1e-5).to(dtype=ori_w.dtype) +- self.scales = nn.Parameter(self.scales,requires_grad=False) +- ori_w = ori_w.detach().mul(self.scales) +- self.weight_q,self.max_val = quantize_mat(ori_w.detach()) +- self.weight_q = nn.Parameter(self.weight_q.t(),requires_grad=False) +- self.max_val = nn.Parameter(self.max_val,requires_grad=False) +- +- def forward(self,x:Tensor) -> Tensor: +- if self.scales is not None: +- x = x.div(self.scales) +- x_q,x_max = quantize_mat(x) +- res = qMatmul(x_q,x_max,self.weight_q,self.max_val,x.dtype) +- if self.bias is not None: +- res = res + self.bias +- return res +- +-# static decomposition +-class W8SDLinear(nn.Module): +- def __init__(self, origin_weight:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): +- super().__init__() +- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) +- self.dtype = origin_weight.dtype +- self.alpha 
= alpha +- if act_max is not None: +- self.idx_unq,self.t = get_unq_idx_topk(act_max,self.alpha) +- else: +- self.idx_unq,self.t = get_unq_idx_topk(origin_weight,self.alpha) +- self.idx_unq,self.t = self.idx_unq.to(origin_weight.device),self.t.to(origin_weight.device) +- self.weight_q,self.weight_unq = decomposition(origin_weight,self.idx_unq,self.t) +- self.weight_q,self.w_max = quantize_mat(self.weight_q.detach()) +- self.weight_q = nn.Parameter(self.weight_q.t(),requires_grad=False) +- self.weight_unq = nn.Parameter(self.weight_unq.t(),requires_grad=False) +- self.w_max = nn.Parameter(self.w_max,requires_grad=False) +- self.t = nn.Parameter(self.t,requires_grad=False) +- self.idx_unq = nn.Parameter(self.idx_unq,requires_grad=False) +- +- def forward(self,x:Tensor) -> Tensor: +- x_q,x_unq = decomposition(x,self.idx_unq,self.t) +- x_q,x_max = quantize_mat(x_q) +- res_q = qMatmul(x_q,x_max,self.weight_q,self.w_max,x.dtype) +- res_unq = torch.matmul(x_unq, self.weight_unq) +- if self.bias is not None: +- res_unq += self.bias +- return res_q + res_unq +- +-class W8DXLinear(nn.Module): +- def __init__(self, origin_weight:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): +- super().__init__() +- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) +- self.dtype = origin_weight.dtype +- self.alpha = alpha +- self.weight_q,self.max_val = quantize_mat(origin_weight.detach()) +- self.weight_q = nn.Parameter(self.weight_q.t(),requires_grad=False) +- self.max_val = nn.Parameter(self.max_val,requires_grad=False) +- +- def forward(self,x:Tensor) -> Tensor: +- idx_unq,t = get_unq_idx_topk(x,self.alpha) +- x_q,x_unq = decomposition(x,idx_unq,t) +- x_q,x_max = quantize_mat(x_q) +- res_q = qMatmul(x_q,x_max,self.weight_q,self.max_val,x.dtype) +- weight_unq= torch.mul(self.weight_q[idx_unq,:],self.max_val.unsqueeze(0)) +- res_unq = torch.matmul(x_unq, weight_unq) +- if self.bias is not None: +- res_unq += self.bias +- return res_q + res_unq +- +- +-quant_cls = { +- "W8":W8Linear, +- "W8X8":W8X8Linear, +- "W8SD":W8SDLinear, +- "W8DX":W8DXLinear +-} +- +-def replace_linear_modules(module:nn.Module,prefix:str,act_scales,cfg): +- for name, child in module.named_children(): +- fullname = (prefix + '.' + name) if prefix != '' else name +- if isinstance(child, nn.Linear): +- strs = fullname.split(".") +- # fullname: model.layers.21.self_attn.q_proj layer_name: 21.q_proj; name: q_proj +- # fullname: lm_head; layer_name: 21.q_proj; name: q_proj; +- layer_name = (strs[-3] + "." 
+ strs[-1]) if len(strs) > 2 else strs[-1] +- if layer_name not in cfg: +- continue +- act_scale = None if act_scales is None or 'act_scale' not in cfg[layer_name] else act_scales[fullname] +- alpha = 32 if 'alpha' not in cfg[layer_name] else cfg[layer_name]['alpha'] +- setattr(module, name,quant_cls[cfg[layer_name]['type']] +- (child.weight,child.bias,act_max=act_scale,alpha=alpha)) +- else: +- replace_linear_modules(child,fullname,act_scales,cfg) +- +-def quantize(model:nn.Module,cfg={}): +- act_scales = None +- if 'act_scales_path' in cfg: +- act_scales = torch.load(cfg['act_scales_path']) +- if 'smooth' in cfg: +- from smooth import smooth_lm +- alpha = 0.85 if "alpha" not in cfg else cfg["alpha"] +- smooth_lm(model, act_scales, alpha) +- replace_linear_modules(model,'',act_scales,cfg) +\ No newline at end of file +diff -uNr ascend-llm/export_llama/readme.md ascend-llm-qwen/export_llama/readme.md +--- ascend-llm/export_llama/readme.md 2024-09-04 19:21:03.078081000 +0800 ++++ ascend-llm-qwen/export_llama/readme.md 1970-01-01 08:00:00.000000000 +0800 +@@ -1,70 +0,0 @@ +-# Llama 模型导出 +- +-## 量化 +- +-主要参考的量化方案有[LLM.int8](https://arxiv.org/abs/2208.07339)和[SmoothQuant](https://arxiv.org/abs/2211.10438) +- +-量化需要引入`quantize.py`和config文件下的配置文件,目前量化方式共有四种:int8仅权重量化(W8),int8全量化(W8X8,传入act_scale可以平滑激活),静态混合精度分解(SD)和动态混合精度分解(W8DX)。根据Llama模型特点,down_proj比qkv_proj, up_proj, gate_proj更难以量化,更深的Decoder Layer更难以量化。配置文件的格式为 +-```python +-quantize_cfg = { +- "0.q_proj":{ # 字典的key为具体某一层,第几个Decoder Layer+线性层名字 +- "type":"W8SD", # 量化类型 +- "act_scale":True, # type=W8X8表示采用平滑激活,type=W8SD表示用生成的act scale进行静态混合精度分解,如果不采用直接不填这一项,判断时只判断是否存在字典中是否存在act_scale的key-value对,不检查值。 +- "alpha":0.85 # 平滑激活的迁移系数,混合精度分解的将多少特征保留为FP16 +- }, +- "act_scales_path":"/root/zanilia/export_llama/act_scales/llama-2-7b.pt", +- "smooth":True, # SmoothQuant的方案,将激活值的缩放与RMSNorm融合,不会造成额外的开销,但down_proj层无法使用 +- "alpha":0.85, #SmoothQuant的迁移系数 +-} +-``` +-创建新的配置文件方式,新建一个python源文件并提供get函数接口,参数为模型配置和act_scale路径,返回dict格式的量化配置。 +-在config文件夹下,提供了几个常用的量化配置:int8仅权重量化(w8.py),int8全量化(w8x8.py),静态混合精度分解(sd.py),动态混合精度分解(w8dx.py),平滑激活(smooth.py),平滑+静态混合精度分解(smsd.py)。 +- +-## 代码解析 +- +-### export_llama.py +- +-通过`python export_llama.py`导出onnx +- +-### modeling_llama.py +- +-对llama模型进行修改,主要修改内容 +-1. 只返回新生成的KV缓存(默认返回所有KV缓存),将返回KV缓存有tuple变为Tensor(torch.cat) +-2. 修改LlamaRotaryEmbedding类,原来的方式采用cache的方式,调用时参数为seq_len,如果返回cache的前seq len个元素(如果不足,则需要再次生成)。修改后,调用每次返回max_position_embeddings个元素。所有元素都提前生成了,seq len参数没有使用,不会再次生成,在导出前应将max_position_embeddings设置大一些。 +- 修改原因主要是:调用LlamaRotaryEmbedding的seq len,为输入长度+kv长度。在apply_rotary_pos_emb,使用position ids为下标取LlamaRotaryEmbedding的输出,获得世纪的PosEmbedding。转om时,这输入长度+kv长度是固定值,如果通过StreamingLLM,H2O等方式驱逐KV缓存,position_ids会超过输入长度+kv长度,导致错误。也可以修改代码获取真实的输入长度+kv长度。 +- +-### export_llama.py +- +-将llama模型导出为onnx文件 +- +-## quantize.py +- +-量化相关代码,总共有四种方法。 +- +-1. W8Linear: int8仅权重量化 +-2. W8X8Linear: vector-wise absmax int8全量化 +-3. W8SDLinear: 静态的混合精度分解,分解方式可以使用按权重分解和按act max分解(推荐按act max分解) +-4. 
W8DXLinear: 动态根据输入进行混合精度分解 +- +-### smooth.py +- +-直接使用[SmoothQuant](https://github.com/mit-han-lab/smoothquant/) 的相关代码,对激活进行平滑,降低量化难度,调用smooth_lm接口进行量化。smoothquant目前对于q_proj,k_proj,v_proj,gate_proj,up_porj进行平滑,对于down_proj的平滑,可以在W8X8Linear参数中传入act max。 +- +-### generate_act_scales.py +- +-直接使用[SmoothQuant](https://github.com/mit-han-lab/smoothquant/) 的相关代码,可以计算某个模型在特定数据集上激活值的最大值,可以用于smoothquant方法的平滑操作和W8SDLinear的混合精度分解。 +- +-```bash +-python examples/generate_act_scales.py \ +- --model-name \ +- --output-path \ +- --num-samples \ +- --seq-len \ +- --dataset-path +-``` +- +-### change_node.py +- +-将cast fp->int8算子转换为AscendQuant算子,用于atc模型转换 +- +- +diff -uNr ascend-llm/export_llama/requirements.txt ascend-llm-qwen/export_llama/requirements.txt +--- ascend-llm/export_llama/requirements.txt 2024-09-04 19:21:03.078081000 +0800 ++++ ascend-llm-qwen/export_llama/requirements.txt 1970-01-01 08:00:00.000000000 +0800 +@@ -1,4 +0,0 @@ +-torch +-transformers==4.35 +-onnx +-lm-eval==0.4.2 # for eval +\ No newline at end of file +diff -uNr ascend-llm/export_llama/smooth.py ascend-llm-qwen/export_llama/smooth.py +--- ascend-llm/export_llama/smooth.py 2024-09-04 19:21:03.078081000 +0800 ++++ ascend-llm-qwen/export_llama/smooth.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,56 +0,0 @@ +-''' +-code from https://github.com/mit-han-lab/smoothquant/ +-''' +-import torch +-import torch.nn as nn +- +-from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm +- +-@torch.no_grad() +-def smooth_ln_fcs_llama_like(ln, fcs, act_scales, alpha=0.5): +- if not isinstance(fcs, list): +- fcs = [fcs] +- assert isinstance(ln, (LlamaRMSNorm,nn.Linear)) +- for fc in fcs: +- assert isinstance(fc, nn.Linear) +- assert ln.weight.shape[0] == fc.in_features == act_scales.numel() +- device, dtype = fcs[0].weight.device, fcs[0].weight.dtype +- act_scales = act_scales.to(device=device, dtype=dtype) +- weight_scales = torch.cat( +- [fc.weight.abs().max(dim=0, keepdim=True)[0] for fc in fcs], dim=0 +- ) +- weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5) +- scales = ( +- (act_scales.pow(alpha) / weight_scales.pow(1 - alpha)) +- .clamp(min=1e-5) +- .to(device) +- .to(dtype) +- ) +- if ln.weight.dim() == 2: +- ln.weight.div_(scales.unsqueeze(-1)) +- else: +- ln.weight.div_(scales) +- for fc in fcs: +- fc.weight.mul_(scales.view(1, -1)) +- +- +-@torch.no_grad() +-def smooth_lm(model, scales, alpha=0.5): +- for name, module in model.named_modules(): +- if isinstance(module, LlamaDecoderLayer): +- attn_ln = module.input_layernorm # attention forward norm +- qkv = [ +- module.self_attn.q_proj, +- module.self_attn.k_proj, +- module.self_attn.v_proj, +- ] +- +- qkv_input_scales = scales[name + ".self_attn.q_proj"] +- smooth_ln_fcs_llama_like(attn_ln, qkv, qkv_input_scales, alpha) +- +- ffn_ln = module.post_attention_layernorm # feed forward norm +- fcs = [module.mlp.gate_proj, module.mlp.up_proj] +- fcs_input_scales = scales[name + ".mlp.gate_proj"] +- +- smooth_ln_fcs_llama_like(ffn_ln, fcs, fcs_input_scales, alpha) +- # smooth_ln_fcs_llama_like(module.mlp.up_proj,module.mlp.down_proj,scales[name + ".mlp.down_proj"],0.9) +\ No newline at end of file +diff -uNr ascend-llm/inference/config.py ascend-llm-qwen/inference/config.py +--- ascend-llm/inference/config.py 2024-09-04 19:21:03.079083400 +0800 ++++ ascend-llm-qwen/inference/config.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,64 +0,0 @@ +-from dataclasses import dataclass,field +-from typing import Optional,Union,List,Dict +-import os +-import json +- 
+-@dataclass +-class InferenceConfig: +- tokenizer: str = "" +- hf_model_dir:str = "" # huggingface 模型目录,包含tokenizer和config.json +- sampling_method: str = "top_k" # "greedy" | "top_k" | "top_p" +- sampling_value: float = 10 # k for top_k p for top_p +- temperature: float = 0.7 +- max_length:int = 512 # 输出长度的最大值 +- max_input_len:int = 1 # 每次推理输入的最大长度为max_input_len,对om目前设置为1 +- session_type:str="acl" # onnx或者acl +- acl_mode="rc" #rc模式下host和device是同一块内存,可以对执行流程进行优化 +- device:int=0 +- # prompt:List[Dict[str,str]] = field(default_factory=list) +- prompt:List[Dict[str,str]] = field(default_factory=lambda: [ +- {"role":"user","content":"Hey there I am a human that would like to have a conversation with you."}, +- {"role":"assistant","content":"Sure, I am happy to answer your questions"}, +- {"role":"user","content":"Great, I insist that we take turns."}, +- {"role":"assistant","content":"I agree, we should take turns."}, +- ]) +- model:str="" +- kvcache_method:str = "sliding-window" # "basic"|"sliding-window"|'streamllm'|'H2O' +- kvcache_fixsize:bool = True # 输入的kv缓存是否固定shape +- head_len:int= 32 # 在KVCache evict时前head len会被保留 +- recent_len:int = 32 # 在KVCache evict时最近recent len会被保留 +- evict_len:int = 64 # KVCache 逐出的最小值,当KVCache达到最大值时将逐出evict_len个KVCache +- n_layer:int = 22 +- format:str='huggingface-tensor' #KVcache的格式 +- max_cache_size:int=256 # kvcache的最大长度 +- head_num:int=4 +- num_kv_group:int = 8 # for GQA +- head_dim:int=64 +- hidden_dim:int=2048 +- dtype:str="float16" +- model_type:str="llama-2-7b" +- +- def __post_init__(self): +- assert(self.kvcache_method in ["basic","sliding-window",'streamllm','H2O']) +- assert(os.path.isdir(self.hf_model_dir)) +- assert(self.session_type in ["acl","onnx"]) +- if self.session_type == "onnx": +- self.max_input_len = self.max_length +- self.evict_len = int(min((self.max_cache_size - self.head_len )/2,self.evict_len )) +- self.max_input_len = int(min(self.max_input_len,self.evict_len)) +- self.tokenizer = self.hf_model_dir +- model_desc = None +- with open(self.hf_model_dir+"/config.json") as f: +- model_desc = json.load(f) +- self.n_layer = model_desc['num_hidden_layers'] +- self.head_num = model_desc['num_key_value_heads'] +- self.num_kv_group = int(model_desc['num_attention_heads'] / self.head_num) +- self.hidden_dim = model_desc["hidden_size"] +- self.head_dim = int(self.hidden_dim / model_desc['num_attention_heads']) +- if self.hidden_dim == 2048: +- self.model_type = "tiny-llama" +- if self.kvcache_method == "streamllm": +- assert(self.head_len+self.evict_len < self.max_cache_size) +- if self.kvcache_method == "H2O": +- self.evict_len = int(min((self.max_cache_size - self.head_len -self.recent_len )/2,self.evict_len )) +- assert(self.head_len+self.recent_len+self.evict_len < self.max_cache_size) +diff -uNr ascend-llm/inference/engine.py ascend-llm-qwen/inference/engine.py +--- ascend-llm/inference/engine.py 2024-09-04 19:21:03.079083400 +0800 ++++ ascend-llm-qwen/inference/engine.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,242 +0,0 @@ +-import time +-from typing import Dict, List +-import acl +-import numpy as np +-import os +-import ctypes +-from ctypes import c_void_p, c_int, c_size_t, c_ulong, c_int64,POINTER +-ACL_MEM_MALLOC_HUGE_FIRST = 0 +-ACL_MEMCPY_HOST_TO_DEVICE = 1 +-ACL_MEMCPY_DEVICE_TO_HOST = 2 +-ACL_MEM_MALLOC_NORMAL_ONLY = 2 +- +-libc = ctypes.CDLL("libc.so.6") +-# mmap函数原型 +-mmap_func = libc.mmap +-mmap_func.argtypes = [c_void_p, c_size_t, c_int, c_int, c_int, c_int64] +-mmap_func.restype = c_void_p +- +-# munmap函数原型 +-munmap_func 
= libc.munmap +-munmap_func.argtypes = [c_void_p, c_size_t] +-munmap_func.restype = c_int +-def mmap_file(file_path): +- # 打开文件并获取文件描述符 +- file_descriptor = os.open(file_path, os.O_RDONLY) +- file_size = os.lseek(file_descriptor, 0, os.SEEK_END) +- os.lseek(file_descriptor, 0, os.SEEK_SET) +- # 调用mmap映射文件到内存 +- # PROT_READ和MAP_PRIVATE的值可能因系统而异,这里假设为1和2 +- protection_flags = 1 # PROT_READ +- visibility_flags = 2 # MAP_PRIVATE +- mapped_memory = mmap_func(None, file_size, protection_flags, visibility_flags, file_descriptor, 0) +- if mapped_memory == -1: +- raise Exception("Error mapping the file.") +- +- # 关闭文件描述符,映射区域仍然有效 +- os.close(file_descriptor) +- +- # 返回映射区域的地址 +- return mapped_memory,file_size +-def check_ret(str,ret): +- if ret != 0: +- print(f"return code is {ret}, detail: {str}",flush=True) +- +-def initResource(device): +- ret = acl.init() +- check_ret("init", ret) +- ret = acl.rt.set_device(device) +- check_ret("set_device", ret) +- context,ret = acl.rt.create_context(device) +- check_ret("create_context", ret) +- return context +- +-def destroyResource(device,context): +- ret = acl.rt.reset_device(device) +- ret = acl.finalize() +- ret = acl.rt.destroy_context(context) +- +-ACL_FLOAT,ACL_FLOAT16,ACL_INT8,ACL_INT32,ACL_INT64 = 0,1,2,3,9 +-NPY_FLOAT32,NPY_FLOAT16,NPY_INT8,NPY_INT32,NPY_INT64 = 11,23,1,5,7 +-dtype2NpType = {ACL_FLOAT:np.float32,ACL_FLOAT16:np.float16,ACL_INT8:np.int8,ACL_INT32:np.int32,ACL_INT64:np.int64} +-dtypeMp = {ACL_FLOAT:NPY_FLOAT32,ACL_FLOAT16:NPY_FLOAT16,ACL_INT8:NPY_INT8,ACL_INT32:NPY_INT32,ACL_INT64:NPY_INT64} +-class ACLModel: +- def __init__(self,model_path,mode="rc",context=None,callback=None): +- self.context = context +- self.model_id = None +- self.model_desc = None +- self.callback_func = callback +- self.tid = None +- self.stream = None +- self.callback_interval = 1 +- self.exit_flag = False +- self.mode = mode +- self.input_dataset, self.output_dataset = None, None +- self.inputs:List[Dict[str,]] = [] +- self.outputs:List[Dict[str,]] = [] +- self.in_arrs:List[np.ndarray] = [] +- self.out_arrs:List[np.ndarray] = [] +- self.loadModel(model_path) +- self.allocateMem() +- if not callback: +- return +- self.stream, ret = acl.rt.create_stream() +- self.tid, ret = acl.util.start_thread(self._process_callback, +- [self.context, 50]) +- check_ret("acl.util.start_thread", ret) +- ret = acl.rt.subscribe_report(self.tid, self.stream) +- check_ret("acl.rt.subscribe_report", ret) +- +- def unload(self): +- if self.callback_func: +- ret = acl.rt.synchronize_stream(self.stream) +- # 2.7 取消线程注册,Stream上的回调函数不再由指定线程处理。 +- ret = acl.rt.unsubscribe_report(self.tid, self.stream) +- self.exit_flag = True +- ret = acl.util.stop_thread(self.tid) +- ret = acl.rt.destroy_stream(self.stream) +- self.freeMem() +- self.unloadModel() +- +- def loadModel(self, model_path): +- ''' +- model_size = os.path.getsize(model_path) +- +- work_size, weight_size, ret = acl.mdl.query_size(model_path) +- weight_size = max(model_size,weight_size) +- work_ptr, ret= acl.rt.malloc_host(work_size) +- model = acl.rt.malloc_host(weight_size) +- with open(model_path, 'rb') as file: +- model = file.read() +- self.model_id, ret = acl.mdl.load_from_mem_with_mem(id(model), weight_size, work_ptr, work_size, id(model), weight_size) +- ''' +- model_add, model_size = mmap_file(model_path) +- self.model_id, ret = acl.mdl.load_from_mem(model_add, model_size) +- +- #self.model_id, ret = acl.mdl.load_from_file(model_path) +- check_ret("load model",ret) +- munmap_func(model_add, model_size) +- 
self.model_desc = acl.mdl.create_desc() +- ret = acl.mdl.get_desc(self.model_desc, self.model_id) +- check_ret("get model desc",ret) +- +- def unloadModel(self): +- ret = acl.mdl.unload(self.model_id) +- if self.model_desc: +- ret = acl.mdl.destroy_desc(self.model_desc) +- self.model_desc = None +- +- def allocateMem(self): +- self.input_dataset = acl.mdl.create_dataset() +- input_size = acl.mdl.get_num_inputs(self.model_desc) +- for i in range(input_size): +- buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i) +- buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) +- check_ret("alloc input memory",ret) +- data = acl.create_data_buffer(buffer, buffer_size) +- _, ret = acl.mdl.add_dataset_buffer(self.input_dataset, data) +- check_ret("add_dataset_buffer",ret) +- dims, ret = acl.mdl.get_input_dims(self.model_desc, i) +- self.inputs.append({"buffer": buffer, "size": buffer_size}) +- if self.mode == 'rc': +- data_type = acl.mdl.get_input_data_type(self.model_desc, i) +- self.in_arrs.append(acl.util.ptr_to_numpy(buffer,tuple(dims['dims']),dtypeMp[data_type])) +- +- self.output_dataset = acl.mdl.create_dataset() +- output_size = acl.mdl.get_num_outputs(self.model_desc) +- buffer_host = None +- for i in range(output_size): +- buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i) +- data_type = acl.mdl.get_output_data_type(self.model_desc, i) +- buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) +- check_ret("alloc output memory",ret) +- data = acl.create_data_buffer(buffer, buffer_size) +- _, ret = acl.mdl.add_dataset_buffer(self.output_dataset, data) +- check_ret("add_dataset_buffer",ret) +- dims, ret = acl.mdl.get_output_dims(self.model_desc, i) +- if self.mode == 'rc': +- self.out_arrs.append(acl.util.ptr_to_numpy(buffer,tuple(dims['dims']),dtypeMp[data_type])) +- else: +- buffer_host, ret = acl.rt.malloc_host(buffer_size) +- check_ret("alloc output host memory",ret) +- self.outputs.append({"buffer": buffer, "size": buffer_size,'buffer_host':buffer_host,'dtype':dtype2NpType[data_type]}) +- +- def freeMem(self): +- for item in self.input_data: +- ret = acl.rt.free(item["buffer"]) +- ret = acl.mdl.destroy_dataset(self.input_dataset) +- for item in self.output_data: +- ret = acl.rt.free(item["buffer"]) +- if self.mode != 'rc': +- ret = acl.rt.free_host(item["buffer_host"]) +- ret = acl.mdl.destroy_dataset(self.output_dataset) +- +- def getInputs(self): +- return self.in_arrs # 获取输入np数组,可以直接修改 +- +- def inference(self,datas) -> List[np.ndarray]: +- acl.rt.set_context(self.context) +- if self.mode == 'rc': +- for i,data in enumerate(datas): +- self.in_arrs[i][:] = data[:] # 如果输入的np数组和in_arrs中是一个数组则不会发生拷贝 +- else: +- for i,data in enumerate(datas): +- bytes_data = data.tobytes() +- np_ptr = acl.util.bytes_to_ptr(bytes_data) +- ret = acl.rt.memcpy(self.inputs[i]["buffer"], self.inputs[i]["size"], np_ptr,self.inputs[i]["size"], ACL_MEMCPY_HOST_TO_DEVICE) +- check_ret("memcpy", ret) +- ret = acl.mdl.execute(self.model_id, self.input_dataset,self.output_dataset) +- check_ret("execute", ret) +- if self.mode == 'rc': +- return self.out_arrs +- inference_result = [] +- for idx,out in enumerate(self.outputs): +- ret = acl.rt.memcpy(out['buffer_host'], out["size"],out["buffer"],out["size"],ACL_MEMCPY_DEVICE_TO_HOST) +- bytes_out = acl.util.ptr_to_bytes(out['buffer_host'], out["size"]) +- dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, idx) +- out_data = np.frombuffer(bytes_out, dtype=out['dtype']).reshape(dims['dims']) +- 
inference_result.append(out_data) +- return inference_result +- +- def inference_async(self,datas,other_args) -> List[np.ndarray]: +- acl.rt.set_context(self.context) +- if self.mode == 'rc': +- for i,data in enumerate(datas): +- self.in_arrs[i][:] = data[:] +- else: +- for i,data in enumerate(datas): +- np_ptr = acl.util.bytes_to_ptr(data.tobytes()) +- ret = acl.rt.memcpy(self.inputs[i]["buffer"], self.inputs[i]["size"], np_ptr,self.inputs[i]["size"], ACL_MEMCPY_HOST_TO_DEVICE) +- check_ret("memcpy", ret) +- ret = acl.mdl.execute_async(self.model_id, self.input_dataset,self.output_dataset,self.stream) +- check_ret("exec_async", ret) +- print(f"submit exec task {other_args[1]}") +- ret = acl.rt.launch_callback(self.callPostProcess,other_args,1,self.stream) +- check_ret("launch callback", ret) +- +- def _process_callback(self, args_list): +- context, timeout = args_list +- acl.rt.set_context(context) +- while self.callback_interval: +- acl.rt.process_report(timeout) +- if self.exit_flag: +- print("[Callback] exit acl.rt.process_report") +- break +- +- def callPostProcess(self,other_args): +- print("start callback",flush=True) +- time1 = time.time() +- inference_result = [] +- if self.mode == 'rc': +- inference_result = self.out_arrs +- else: +- for idx,out in enumerate(self.outputs): +- ret = acl.rt.memcpy(out['buffer_host'], out["size"],out["buffer"],out["size"],ACL_MEMCPY_DEVICE_TO_HOST) +- bytes_out = acl.util.ptr_to_bytes(out['buffer_host'], out["size"]) +- dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, idx) +- data = np.frombuffer(bytes_out, dtype=out['dtype']).reshape(dims['dims']) +- inference_result.append(data) +- if not self.callback_func: +- return +- self.callback_func(inference_result,other_args) +- print(f"end callback, use time: {time.time()-time1}") +\ No newline at end of file +diff -uNr ascend-llm/inference/inference.py ascend-llm-qwen/inference/inference.py +--- ascend-llm/inference/inference.py 2024-09-04 19:21:03.079083400 +0800 ++++ ascend-llm-qwen/inference/inference.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,154 +0,0 @@ +-import numpy as np +-import os +-from typing import Any, Generator, List,Tuple,Dict +-from threading import Lock +-from session import Session +-from config import InferenceConfig +- +-class LlamaInterface: +- def __init__(self,config:InferenceConfig) -> None: +- self.max_length = config.max_length +- from transformers import AutoTokenizer +- self.tokenizer=AutoTokenizer.from_pretrained(config.tokenizer) +- self.sampling_method=config.sampling_method +- self.sampling_value = config.sampling_value +- self.temperature=config.temperature +- self.session=Session.fromConfig(config) +- self.prompt=config.prompt +- self.state:dict[str,Any] = {"code":200,"isEnd":False,"message":""} +- self.first=True +- self.stop_mp = {"[|Human|]":6,"[|AI|]":5,"<|assistant|>":6,"<|user|>":5,"<|system|>":5} +- self.stop_words = ["<|user|>","<|assistant|>","<|system|>","[|AI|]","[|Human|]"] +- self.model_type = config.model_type +- self.last_output="" +- self.lock = Lock() +- self.reset() +- print("init success") +- +- def generate_cache(self,prompt:str): +- if len(prompt) == 0 : +- return +- input_ids = np.asarray(self.encode(prompt,add_bos_token=self.first),dtype=np.int64).reshape(1,-1) +- self.first=False +- logits = self.session.run(input_ids)[0] +- return self.sample_logits(logits[0][-1:],self.sampling_method,self.sampling_value,self.temperature),logits +- +- def sample_logits( +- self, +- logits: np.ndarray, +- sampling_method: str = "greedy", +- 
sampling_value: float = None, +- temperature: float = 1.0, +- ) -> np.ndarray: +- if temperature == 0 or sampling_method == "greedy": +- next_token = np.argmax(logits, axis=-1).astype(np.int64) +- +- elif sampling_method == "top_k" or sampling_method == "top_p": +- assert sampling_value is not None +- logits = logits.astype(np.float32) +- logits /= temperature +- probs = np.exp(logits) / np.sum(np.exp(logits)) +- sorted_probs = np.sort(probs)[:, ::-1] +- sorted_indices = np.argsort(probs)[:, ::-1] +- +- if sampling_method == "top_k": +- index_of_interest = int(sampling_value) +- elif sampling_method == "top_p": +- p = sampling_value +- cumulative_probs = np.cumsum(sorted_probs, axis=-1) +- for index_of_interest, cumulative_prob in enumerate( +- cumulative_probs[0] +- ): +- if cumulative_prob > p: +- break +- +- probs_of_interest = sorted_probs[:, : index_of_interest + 1] +- indices_of_interest = sorted_indices[:, : index_of_interest + 1] +- probs_of_interest /= np.sum(probs_of_interest) +- next_token = np.array( +- [np.random.choice(indices_of_interest[0], p=probs_of_interest[0])] +- ) +- else: +- raise Exception(f"Unknown sampling method {sampling_method}") +- +- return next_token +- +- +- def format_last_output(self): +- if len(self.last_output) == 0: +- return +- text_format = self.apply_chat_template([{"role":"assistant","content":self.last_output}]) +- self.generate_cache(text_format[len(self.last_output):]) +- self.last_output = "" +- +- def predict(self, text): +- with self.lock: +- self.state['isEnd'],self.state['message'] = False,"" +- if text == "": +- return +- self.format_last_output() +- text = self.apply_chat_template([{"role":"user","content":text}]) +- input_ids = self.encode(text,add_bos_token=self.first) +- input_ids = np.asarray(input_ids,dtype=np.int64).reshape(1,-1) +- self.first,ids_list = False,[] +- for i in range(self.max_length): +- logits = self.session.run(input_ids)[0] +- input_ids = self.sample_logits(logits[0][-1:], self.sampling_method, self.sampling_value, self.temperature) +- input_ids = input_ids.reshape(1, -1) +- if input_ids[0] == self.tokenizer.eos_token_id: +- self.session.rollback(1) +- break +- ids_list.append(input_ids[0].item()) +- text_out = self.tokenizer.decode(ids_list) +- stop_word = is_stop_word_or_prefix(text_out,self.stop_words) +- if stop_word != "": +- ids_list = ids_list[:-self.stop_mp[stop_word]] +- self.session.rollback(self.stop_mp[stop_word]) +- break +- if i%3 == 0: +- with self.lock: +- self.state['message']=text_out +- self.last_output = self.tokenizer.decode(ids_list) +- with self.lock: +- self.state['message'],self.state['isEnd'] = self.last_output,True +- return self.last_output +- +- def reset(self): +- self.first = True +- self.last_output = "" +- self.session.reset() +- self.generate_cache(self.apply_chat_template(self.prompt)) +- +- def getState(self): +- with self.lock: +- return self.state.copy() +- +- def apply_chat_template(self,messages:List[Dict[str,str]]) -> str: +- text = "" +- if self.model_type == "llama-2-7b": +- for message in messages: +- if message["role"] == "user": +- text += f'[|Human|]\n{message["content"]}\n[|AI|]' +- elif message["role"] == "system": +- text += f'[|System|]\n{message["content"]}\n' +- else: +- text += f'{message["content"]}\n' +- elif self.model_type == "tiny-llama": +- for message in messages: +- if message["role"] == "user": +- text += f'<|user|>\n{message["content"]}\n<|assistant|>' +- elif message["role"] == "system": +- text += f'<|system|>\n{message["content"]}\n' +- else: +- text 
+= f'{message["content"]}\n' +- return text +- +- def encode(self,text,add_bos_token=False): +- self.tokenizer.add_bos_token = add_bos_token +- return self.tokenizer.encode(text) +- +-def is_stop_word_or_prefix(s: str, stop_words: list) -> int: +- for stop_word in stop_words: +- if s.endswith(stop_word): +- return stop_word +- return "" +\ No newline at end of file +diff -uNr ascend-llm/inference/kvcache.py ascend-llm-qwen/inference/kvcache.py +--- ascend-llm/inference/kvcache.py 2024-09-04 19:21:03.079083400 +0800 ++++ ascend-llm-qwen/inference/kvcache.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,199 +0,0 @@ +-import numpy as np +-from typing import Optional,Tuple,List +-from config import InferenceConfig +-# 对KV缓存和输出输出格式进行管理 +-class KVCache: +- def __init__(self,cfg:InferenceConfig) -> None: +- self.head_len = cfg.head_len +- self.max_size = cfg.max_cache_size +- self.input_pos = 0 +- self.kv_size = 0 +- self.n_layer = cfg.n_layer +- self.kvCache = None +- self.format=cfg.format +- self.head_num=cfg.head_num +- self.head_dim=cfg.head_dim +- self.dtype=np.float16 +- self.fix_size = cfg.kvcache_fixsize +- self.evict_len = cfg.evict_len +- self.recent_len = cfg.recent_len +- self.num_kv_group = cfg.num_kv_group +- if cfg.dtype == "float16": +- self.dtype=np.float16 +- elif cfg.dtype=="float32": +- self.dtype=np.float32 +- self.createEmptyCache() +- +- def createEmptyCache(self): +- if self.format == "huggingface-tensor": +- self.kvCache=np.zeros((self.n_layer,2,1,self.head_num,self.max_size,self.head_dim),dtype=self.dtype) +- elif self.format == "huggingface-list": +- self.kvCache=[] +- for i in range(self.n_layer): +- self.kvCache.append([np.zeros((1,self.head_num,self.max_size,self.head_dim),dtype=self.dtype),np.zeros((1,self.head_num,self.max_size,self.head_dim),dtype=self.dtype)]) +- elif self.format == 'seq_nhead_headdim': +- self.kvCache = [np.zeros((1,self.n_layer,self.max_size,self.head_num,self.head_dim),dtype=self.dtype),np.zeros((1,self.n_layer,self.max_size,self.head_num,self.head_dim),dtype=self.dtype)] +- elif self.format == 'nhead_seq_headdim': +- self.kvCache = [np.zeros((1,self.n_layer,self.head_num,self.max_size,self.head_dim),dtype=self.dtype),np.zeros((1,self.n_layer,self.head_num,self.max_size,self.head_dim),dtype=self.dtype)] +- +- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],scores:Optional[np.ndarray]=None)->None: +- pass +- +- def evict(self,space_need:int): +- pass +- +- def getInputs(self, seq_len: int) -> List[np.ndarray]: +- cache,mask = None,None +- if self.fix_size: +- cache,mask = self.kvCache, np.ones((1,self.max_size+seq_len),dtype=np.int64) +- mask[:,self.kv_size:self.max_size] = 0 +- else: +- cache,mask = self.kvCache[:,:,:,:,:self.kv_size], np.ones((1,self.kv_size+seq_len),dtype=np.int64) +- pos_id =np.arange(self.input_pos,self.input_pos+seq_len,dtype=np.int64).reshape(1,-1) +- return cache,mask,pos_id +- +- def reset(self): +- self.input_pos=0 +- self.kv_size=0 +- +- def rollback(self,seq_len): +- self.kv_size -=seq_len +- +- @staticmethod +- def create(config:InferenceConfig) -> 'KVCache': +- if config.kvcache_method == "basic": +- return Basic(config) +- elif config.kvcache_method == "sliding-window": +- return SWindow(config) +- elif config.kvcache_method == 'streamllm': +- return StreamLLM(config) +- elif config.kvcache_method == 'H2O': +- return H2O(config) +- else: +- return None +- +-class Basic(KVCache): +- def __init__(self, cfg: InferenceConfig) -> None: +- super().__init__(cfg) +- +- def update(self, seq_len: 
int, newKV: Tuple[List[np.ndarray]], scores: Optional[np.ndarray] = None) -> None: +- if seq_len + self.kv_size > self.max_size: +- raise RuntimeError("超出KV缓存长度限制") +- if self.format=="huggingface-tensor": +- self.kvCache[:,:,:,:,self.kv_size:self.kv_size+seq_len,:] = newKV[:,:,:,:,0:seq_len,:] +- self.kv_size += seq_len +- self.input_pos+=seq_len +- +-class SWindow(KVCache): +- def __init__(self,cfg:InferenceConfig) -> None: +- super().__init__(cfg) +- self.p=0 +- self.cnt = 0 +- +- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],score:Optional[np.ndarray] = None): +- self.input_pos+=seq_len +- cur = 0 +- while self.p + seq_len > self.max_size: +- self.update_part(newKV,cur,self.max_size-self.p) +- cur += (self.max_size-self.p) +- seq_len -= (self.max_size-self.p) +- self.p = self.head_len +- self.kv_size = self.max_size +- self.cnt += 1 +- self.update_part(newKV,cur,seq_len) +- self.p += seq_len +- self.kv_size = max(self.p,self.kv_size) +- +- def update_part(self,newKV:Tuple[List[np.ndarray],List[np.ndarray]],begin:int,len:int): +- if len == 0: +- return +- if self.format == 'huggingface-tensor': #[n_layer,2,batch_size,head_num,len,head_dim] +- self.kvCache[:,:,:,:,self.p:self.p+len,:] = newKV[:,:,:,:,begin:begin+len,:] +- if self.format=='seq_nhead_headdim': # [batch, n_layers, seq_len, n_heads, head_dim] +- self.kvCache[0][:,:,self.p:self.p+len] = newKV[0][:,:,begin:begin+len] +- self.kvCache[1][:,:,self.p:self.p+len] = newKV[1][:,:,begin:begin+len] +- elif self.format=='nhead_seq_headdim': # [batch, n_layers, n_heads, seq_len, head_dim] +- self.kvCache[0][:,:,:,self.p:self.p+len] = newKV[0][:,:,:,begin:begin+len] +- self.kvCache[1][:,:,:,self.p:self.p+len] = newKV[1][:,:,:,begin:begin+len] +- elif self.format=='huggingface-list': # (n_layer,2) * [batch_size,head_num,len,head_dim] +- for i in range(self.n_layer): +- self.kvCache[i][0][:,:,self.p:self.p+len,:] = newKV[i][0][:,:,begin:begin+len,:] +- self.kvCache[i][1][:,:,self.p:self.p+len,:] = newKV[i][1][:,:,begin:begin+len,:] +- +- def reset(self): +- self.p=0 +- return super().reset() +- +- def rollback(self, seq_len): +- if self.cnt != 0: +- self.p -= seq_len +- self.cnt -= 1 +- if self.p < self.head_len: +- self.p = self.max_size - (self.head_len - self.p) + 1 +- if self.cnt == 0: +- self.kv_size = self.p +- else: +- self.p -= seq_len +- self.kv_size -= seq_len +- +-class StreamLLM(KVCache): +- def __init__(self,cfg:InferenceConfig): +- super().__init__(cfg) +- +- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],score:Optional[np.ndarray] = None): +- if self.kv_size + seq_len >= self.max_size: +- self.evict(self.evict_len) +- self.input_pos += seq_len +- self.kvCache[:,:,:,:,self.kv_size:self.kv_size+seq_len] = newKV +- self.kv_size += seq_len +- +- def evict(self, space_need: int): +- self.kvCache[:,:,:,:,self.head_len:self.kv_size -space_need] = \ +- self.kvCache[:,:,:,:,self.head_len+space_need:self.kv_size] +- self.kv_size -= space_need +- +-class H2O(KVCache): +- def __init__(self,cfg:InferenceConfig) -> None: +- super().__init__(cfg) +- self.scores = np.zeros((self.n_layer,1,self.head_num,self.max_size),dtype=self.dtype) +- self.idx_head = np.arange(0,self.head_num,dtype=np.int32).reshape(-1,1) +- +- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],score:Optional[np.ndarray] = None): +- # score [n_layer,batch,nheader,input_len,all_len] +- if self.num_kv_group != 1: +- score = 
score.reshape(self.n_layer,1,self.num_kv_group,self.head_num,seq_len,-1).sum(axis=2) +- elif not score.flags.writeable: +- score = score.copy() # acl 返回的ndarray不可写 +- score[:,:,:,:,self.kv_size:self.kv_size+seq_len] = score[:,:,:,:,-seq_len:] +- if self.kv_size + seq_len > self.max_size: +- self.o_score = score +- self.evict(self.evict_len) +- self.o_score,score = None,self.o_score +- self.input_pos += seq_len +- self.kvCache[:,:,:,:,self.kv_size:self.kv_size+seq_len] = newKV +- for i in range(seq_len): +- self.update_score_one(score[:,:,:,i]) +- +- def update_score_one(self,score:Optional[np.ndarray] = None): +- self.kv_size += 1 +- self.scores[:,:,:,:self.kv_size] = self.scores[:,:,:,:self.kv_size] * 0.5 + score[:,:,:,:self.kv_size] +- +- def evict(self, space_need): +- r_len = self.kv_size - space_need - self.head_len -self.recent_len # 对前head len个KV缓存进行保留 +- new_seq = self.o_score.shape[-2] +- for i in range(self.n_layer): +- idx=np.argpartition(-self.scores[i,0,:,self.head_len:self.kv_size-self.recent_len],r_len,axis=-1)[:,:r_len] +- for j in range(2): +- self.kvCache[i,j,0,:,self.head_len:self.head_len+r_len] = self.kvCache[i,j,0,self.idx_head,idx] +- self.kvCache[i,j,0,:,self.head_len+r_len:self.kv_size-space_need] = \ +- self.kvCache[i,j,0,:,self.kv_size-self.recent_len:self.kv_size] +- self.scores[i,0,:,self.head_len:r_len+self.head_len] = self.scores[i,0,self.idx_head,idx] +- self.scores[i,0,:,self.head_len+r_len:self.kv_size-space_need] = \ +- self.scores[i,0,:,self.kv_size-self.recent_len:self.kv_size] +- for j in range(new_seq): +- self.o_score[i,0,:,j,self.head_len:r_len+self.head_len] = self.o_score[i,0,self.idx_head,j,idx] +- self.o_score[i,0,:,j,self.head_len+r_len:self.kv_size+new_seq-space_need] = \ +- self.o_score[i,0,:,j,self.kv_size-self.recent_len:self.kv_size+new_seq] +- self.scores[i,0,:,r_len+self.head_len+self.recent_len:] = 0 +- self.kv_size = r_len + self.head_len + self.recent_len +- # self.head_len + r_len + self.recent_len + new_seq +diff -uNr ascend-llm/inference/main.py ascend-llm-qwen/inference/main.py +--- ascend-llm/inference/main.py 2024-09-04 19:21:03.080078900 +0800 ++++ ascend-llm-qwen/inference/main.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,94 +0,0 @@ +-import argparse +-import sys +-from concurrent.futures import ThreadPoolExecutor +-from config import InferenceConfig +-from inference import LlamaInterface +- +-def main(cli:bool,engine:LlamaInterface): +- if cli: +- while True: +- line = input() +- print(engine.predict(line)) +- from flask import Flask, request, jsonify +- from flask import render_template # 引入模板插件 +- from flask_cors import CORS +- pool = ThreadPoolExecutor(max_workers=2) +- app = Flask( +- __name__, +- static_folder='./dist', # 设置静态文件夹目录 +- template_folder="./dist", +- static_url_path="" +- ) +- +- CORS(app, resources=r'/*') +- +- @app.route('/') +- def index(): +- return render_template('index.html', name='index') +- +- @app.route("/api/chat", methods=["POST"]) +- def getChat(): +- msg = request.get_json(force=True)['message'] +- if len(msg) == 0: +- return jsonify({"code": 404}) +- pool.submit(engine.predict,msg) +- return jsonify({"code": 200}) +- +- @app.route("/api/getMsg", methods=["GET"]) +- def getMsg(): +- return jsonify(engine.getState()) +- +- @app.route("/api/reset", methods=["GET"]) +- def reset(): +- engine.reset() +- return jsonify({"code": 200}) +- +- app.run( +- use_reloader=False, +- host="0.0.0.0", +- port=5000 +- ) +- +-if __name__ == '__main__': +- parser = argparse.ArgumentParser() +- 
parser.add_argument( +- '--cli', dest='cli', default=False, action='store_true', +- help="run web ui by default, if add --cli, run cli." +- ) +- parser.add_argument("--kv_size", type=int, default=256) +- parser.add_argument( +- "--engine", type=str, default="acl", +- help="inference backend, onnx or acl" +- ) +- parser.add_argument( +- "--sampling", type=str, default="top_k", +- help="sampling method, greedy, top_k or top_p" +- ) +- parser.add_argument( +- "--sampling_value",type=float,default=10, +- help="if sampling method is seted to greedy, this argument will be ignored; if top_k, it means value of p; if top_p, it means value of p" +- ) +- parser.add_argument( +- "--temperature",type=float,default=0.7, +- help="sampling temperature if sampling method is seted to greedy, this argument will be ignored." +- ) +- parser.add_argument( +- "--hf-dir", type=str, default="/root/model/tiny-llama-1.1B", +- help="path to huggingface model dir" +- ) +- parser.add_argument( +- "--model", type=str, default="/root/model/tiny-llama-seq-1-key-256-int8.om", +- help="path to onnx or om model" +- ) +- args = parser.parse_args() +- cfg = InferenceConfig( +- hf_model_dir=args.hf_dir, +- model=args.model, +- max_cache_size=args.kv_size, +- sampling_method=args.sampling, +- sampling_value=args.sampling_value, +- temperature=args.temperature, +- session_type=args.engine, +- ) +- engine = LlamaInterface(cfg) +- main(args.cli,engine) +\ No newline at end of file +diff -uNr ascend-llm/inference/readme.md ascend-llm-qwen/inference/readme.md +--- ascend-llm/inference/readme.md 2024-09-04 19:21:03.080078900 +0800 ++++ ascend-llm-qwen/inference/readme.md 1970-01-01 08:00:00.000000000 +0800 +@@ -1,16 +0,0 @@ +-# inference +- +-目前提供两种运行模式: +-1. cli模式:在终端运行,每一次输入一行,一次性返回所有的推理结果。 +-2. 
web模式:前端代码在[github](https://github.com/yinghuo302/ascend-llm-web)或者[gitee](https://gitee.com/yinghuo302/ascend-llm-web),打包出dist文件夹,放在inference文件夹下即可。 +- +-```bash +-cd inference +-python main.py \ +- --model \ +- --hf-dir \ # 需要tokenizer和模型配置文件,权重不需要 +- --engine +- --sampling --sampling_value <> --temperature <> # 采样相关配置 +- --cli # 添加--cli表示在终端运行 +-``` +-代码需要修改的部分主要在与config.py,可以根据注释修改。inference.py中关于输入格式和结束语判断的部分可能也需要根据具体的模型修改。 +\ No newline at end of file +diff -uNr ascend-llm/inference/requirements.txt ascend-llm-qwen/inference/requirements.txt +--- ascend-llm/inference/requirements.txt 2024-09-04 19:21:03.080078900 +0800 ++++ ascend-llm-qwen/inference/requirements.txt 1970-01-01 08:00:00.000000000 +0800 +@@ -1,6 +0,0 @@ +-# onnxruntime or acl +-onnxruntime +-numpy +-transformers +-flask +-flask_cors +\ No newline at end of file +diff -uNr ascend-llm/inference/session.py ascend-llm-qwen/inference/session.py +--- ascend-llm/inference/session.py 2024-09-04 19:21:03.080078900 +0800 ++++ ascend-llm-qwen/inference/session.py 1970-01-01 08:00:00.000000000 +0800 +@@ -1,87 +0,0 @@ +-from config import InferenceConfig +-from kvcache import KVCache +-import numpy as np +-from typing import List +-import time +-import sys +-class Session: +- def __init__(self,config:InferenceConfig) -> None: +- self.kvCache = KVCache.create(config) +- self.max_len = config.max_input_len +- +- def run(self,input_ids:np.ndarray): +- pass +- +- @staticmethod +- def fromConfig(config:InferenceConfig) -> 'Session': +- if config.session_type == "onnx": +- return OnnxSession(config) +- elif config.session_type=='acl': +- return AclSession(config) +- else: +- return None +- +- def reset(self): +- self.kvCache.reset() +- +- def rollback(self,seq_len): +- self.kvCache.rollback(seq_len) +- +- def evict(self,space_need): +- self.kvCache.evict(space_need) +- +-class OnnxSession(Session): +- def __init__(self,config:InferenceConfig)->None: +- super().__init__(config) +- import onnxruntime +- options = onnxruntime.SessionOptions() +- self.llm_session = onnxruntime.InferenceSession( +- config.model, +- sess_options=options, +- providers=[ +- "DmlExecutionProvider", +- "CUDAExecutionProvider", +- "CPUExecutionProvider", +- ], +- ) +- +- def run(self,input_ids:np.ndarray): +- seq_len=input_ids.shape[-1] +- l,r,result = 0,self.max_len,None +- while l < seq_len: +- r = min(seq_len,r) +- cache,mask,pos_ids = self.kvCache.getInputs(r-l) +- result = self.llm_session.run(None,{ +- "input_ids": input_ids[:,l:r], +- "attention_mask":mask, +- "past_key_values": cache, +- "position_ids": pos_ids, +- }) +- # result: [logits,key_values,attn_scores] +- self.kvCache.update(r-l,result[1],result[2]) +- l , r = l+self.max_len , r + self.max_len +- return result +- +-class AclSession(Session): +- context = None +- def __init__(self,config:InferenceConfig)->None: +- super().__init__(config) +- from engine import ACLModel,initResource +- self.context = initResource(config.device) +- self.model = ACLModel(config.model,context=self.context,mode=config.acl_mode) +- self.input_ids = np.zeros((1,self.max_len),dtype=np.int64) +- if config.acl_mode == 'rc': +- self.input_ids,_,_,self.kvCache.kvCache = self.model.getInputs() +- +- def run(self,input_ids:np.ndarray): +- seq_len=input_ids.shape[-1] +- l,r,result = 0,self.max_len,None +- while l < seq_len: +- r = min(seq_len,r) +- self.input_ids[:,:r-l] = input_ids[:,l:r] +- cache,mask,pos_ids = self.kvCache.getInputs(self.max_len) +- result:List[np.ndarray] = 
self.model.inference([self.input_ids,mask,pos_ids,cache]) +- # result: [logits,key_values,attn_scores] +- self.kvCache.update(r-l,result[1],result[2]) +- l , r = l+self.max_len , r + self.max_len +- return result +\ No newline at end of file +diff -uNr ascend-llm/readme.md ascend-llm-qwen/readme.md +--- ascend-llm/readme.md 2024-09-04 19:21:03.080078900 +0800 ++++ ascend-llm-qwen/readme.md 1970-01-01 08:00:00.000000000 +0800 +@@ -1,135 +0,0 @@ +-# ascend-llm +- +-## 简介 +- +-本项目基于昇腾310芯片部署大语言模型,目前已经成功运行meta-llama/Llama-2-7b-hf和TinyLlama/TinyLlama-1.1B-Chat-v1.0。 +- +-本实践项目由南京大学计算机科学与技术系杜骋同学主导,由朱光辉老师进行指导,由昇腾CANN生态使能团队提供技术支持,并在昇腾开发者大会2024进行了展示。 +- +-## 效果预览 +- +-![](./assets/webui.png) +- +- +-## 关键技术 +-- 静态图方案 +- +- 在Transformer模型中,基于模型的自回归推理特性,业界普遍采用kvcache缓存的方式增加模型的推理性能。kvcache会缓存上一次推理得到的kv矩阵用于本次推理,大大减少了推理计算量。 +- +- 由于缓存的kv矩阵要和当前输入字符计算出的kv矩阵进行拼接,因此每次推理完整的kv矩阵长度一直在增加,致使模型shape不固定,会走动态推理流程,存在大量算子编译时间,推理性能大大下降。 +- +- 本方案基于原先动态图方案,将kv矩阵固定到一个最大长度,结合attention_mask屏蔽输入序列部分位置的特性实现了静态图的方案。在kvcache达到上限时通过KV缓存驱逐([StreamingLLM](https://arxiv.org/abs/2309.17453)和[Heavy-Hitter Oracle](https://arxiv.org/abs/2306.14048))让模型可以反复推理。 +- +-- 量化方案 +- +- 大模型权重过大,在端侧设备由于内存限制通常难以运行,因此通常将大模型权重从fp16量化到int8甚至int4降低内存消耗. +- +- 本项目采用平滑激活([SmoothQuant](https://arxiv.org/abs/2211.10438)),动态混合精度分解(类似[LLM.int8](https://arxiv.org/abs/2208.07339)),静态混合精度分解量化方案,通过对权重和激活值均采用int8量化,显著节省了内存并提升了推理速度。 +- +- +-## 运行方式 +- +-### 环境准备 +- +-1. 昇腾软硬件解决方案(驱动+固件+CANN) +- +- 前往[昇腾社区](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha002/cannquickstart/quickstart/instg_000021.html),按照说明下载安装。 +- 或者下载[香橙派0318镜像](https://www.hiascend.com/forum/thread-0231149828762292018-1-1.html),烧录到sd卡,启动环境,参考[香橙派AIpro快速上手指南](https://www.hiascend.com/forum/thread-0260140249549075069-1-1.html)。 +-2. 第三方依赖 +- +- 模型导出和推理相关文件夹下requirements.txt,使用pip 进行安装。 +- +- ```shell +- pip install -r requirements.txt +- ``` +- +-本项目测试环境:香橙派AI pro,CANN 7.0/7.2,python 3.9。 +- +-### 算子适配 +- +- - protoc安装 +- +- 根据昇腾文档选择合适的protoc版本,protoc版本和CANN版本强相关。CANN7.0/7.2使用的protoc 1.13.0 +- +- ``` +- # 安装protoc==1.13.0, 找一空闲目录下载 +- wget https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/wanzutao/tiny-llama/protobuf-all-3.13.0.tar.gz --no-check-certificate +- tar -zxvf protobuf-all-3.13.0.tar.gz +- cd protobuf-3.13.0 +- apt-get update +- apt-get install autoconf automake libtool +- ./autogen.sh +- ./configure +- make -j4 +- make install +- sudo ldconfig +- protoc --version # 查看版本号 +- ``` +- +- - 算子编译部署 +- ``` +- # 将./custom_op/matmul_integer_plugin.cc 拷贝到指定路径 +- cd tiny_llama +- export ASCEND_PATH=/usr/local/Ascend/ascend-toolkit/latest +- cp custom_op/matmul_integer_plugin.cc $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx/framework/onnx_plugin/ +- cd $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx +- ``` +- 打开build.sh,找到下面四个环境变量,解开注释并修改如下: +- ``` +- export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include +- export TOOLCHAIN_DIR=/usr +- export AICPU_KERNEL_TARGET=cust_aicpu_kernels +- export AICPU_SOC_VERSION=Ascend310B4 +- ``` +- - 编译运行 +- ``` +- ./build.sh +- cd build_out/ +- ./custom_opp_ubuntu_aarch64.run +- # 生成文件到customize到默认目录 $ASCEND_PATH/opp/vendors/,删除冗余文件 +- cd $ASCEND_PATH/opp/vendors/customize +- rm -rf op_impl/ op_proto/ +- ``` +- +-### 模型量化与导出 +- +-导出的模型可以从[阿里云盘](https://www.alipan.com/s/ro1NDLjFxtf)中下载 +- +-1. 
导出onnx:将transformer库中的modeling_llama替换为export_llama文件下的[modeling_llama](./export_llama/modeling_llama_4.35.py)。通过一下命令将模型导出为onnx(相对路径均为相对export_llama.py文件) +- ```bash +- python export_llama.py \ +- --model \ +- --output \ +- --act-path +- --quant +- ``` +- 模型量化具体见[readme](./export_llama/readme.md)。对于TinyLlama-1.1B建议采用per-token的absmax量化(即w8x8.py)或者平滑激活(即smooth.py);对于Llama-2-7b-hf,建议采用静态混合精度分解(即sd.py)或者平滑激活+静态混合精度分解(即smsd.py)。已经测试的方案为TinyLlama-1.1B per-token的absmax量化,Llama-2-7b-hf 静态混合精度分解。 +-3. ATC模型转换 +- ``` bash +- atc --framework=5 --model="xxx.onnx" --output="xxx" --input_format=ND --input_shape="input_ids:batch,seq_len;attention_mask:batch,seq_len+kv_len;position_ids:batch,seq_len;past_key_values:n_layer,2,batch,n_head,kv_len,head_dim" --log=debug --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype +- ``` +- 上述的n_layer, n_head, head_dim变量由模型决定。对于Llama-2-7b,n_layer=32, n_head=32, head_dim=128;对于TinyLlama-1.1B,n_layer=22, n_head=4, head_dim=64 +- +- 对于batch, seq_len, kv_len, 请根据需要填入,建议设置batch=1, seq_len=1, kv_len=1024。如对于TinyLlama-1.1B +- +- ```bash +- atc --framework=5 --model="./tiny-llama.onnx" --output="tiny-llama" --input_format=ND --input_shape="input_ids:1,1;attention_mask:1,1025;position_ids:1,1;past_key_values:22,2,1,4,1024,64" --log=debug --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype +- ``` +- +- 对于Llama-2-7b,ATC转换占用内存较大,建议采用其他设备转换,如采用香橙派进行模型转换可以`export MAX_COMPILE_CORE_NUMBER=1`和`export TE_PARALLEL_COMPILER=1`,并开swap分区(推理时请关闭swap,会影响性能)。 +- +-### 模型推理运行 +- +-目前提供两种运行模式: +-1. cli模式:在终端运行,每一次输入一行,一次性返回所有的推理结果。 +-2. web模式:前端代码在[github](https://github.com/yinghuo302/ascend-llm-web)或者[gitee](https://gitee.com/yinghuo302/ascend-llm-web),打包出dist文件夹,放在inference文件夹下即可。 +- +-```bash +-cd inference +-python main.py \ +- --model \ +- --hf-dir \ # 需要tokenizer和模型配置文件,权重不需要 +- --engine \ +- --sampling --sampling_value <> --temperature <> \ #采样相关配置 +- --cli # 添加--cli表示在终端运行 +-``` diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch new file mode 100644 index 0000000000..4481af301c --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch @@ -0,0 +1,70 @@ +--- modeling_qwen2.py 2024-09-04 22:30:47.490111800 +0800 ++++ modeling_qwen2_export.py 2024-09-04 22:49:20.540908500 +0800 +@@ -162,6 +162,10 @@ + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): ++ return( ++ self.cos_cached.to(dtype=x.dtype), ++ self.sin_cached.to(dtype=x.dtype), ++ ) + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) +@@ -312,6 +316,7 @@ + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + ++ out_key_value = (key_states, value_states) if use_cache else None + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) +@@ -351,7 +356,7 @@ + if not output_attentions: + attn_weights = None + +- return attn_output, attn_weights, past_key_value ++ return attn_output, attn_weights, out_key_value + + + class Qwen2FlashAttention2(Qwen2Attention): +@@ -895,7 +900,7 @@ + # decoder layers + all_hidden_states = () if 
output_hidden_states else None + all_self_attns = () if output_attentions else None +- next_decoder_cache = None ++ next_decoder_cache = [] if use_cache else None + + for decoder_layer in self.layers: + if output_hidden_states: +@@ -926,7 +931,11 @@ + hidden_states = layer_outputs[0] + + if use_cache: +- next_decoder_cache = layer_outputs[2 if output_attentions else 1] ++ key_values = layer_outputs[2 if output_attentions else 1] ++ if isinstance(next_decoder_cache,tuple): ++ next_decoder_cache=list(next_decoder_cache) ++ assert isinstance(next_decoder_cache, list),"transform failed" ++ next_decoder_cache.extend(layer_outputs[2 if output_hidden_states else 1]) + + if output_attentions: + all_self_attns += (layer_outputs[1],) +@@ -937,9 +946,7 @@ + if output_hidden_states: + all_hidden_states += (hidden_states,) + +- next_cache = None +- if use_cache: +- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache ++ next_cache = torch.extend(next_decoder_cache).reshape(len(self.layers),2,*next_decoder_cache[0].shape) if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) +@@ -1433,4 +1440,4 @@ + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, +- ) +\ No newline at end of file ++ ) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py new file mode 100644 index 0000000000..9bc37458c6 --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py @@ -0,0 +1,29 @@ +import onnx +import onnx.helper as helper +from onnx import TensorProtos + +model = onnx.load("qwen2.onnx") +new_nodes = [] + +for node in model.graph.node: + new_nodes = node + if node.op_tyoe == "Trilu": + new_node = helper.make_node( + "Trilu", + inputs=[node.input[0]], + outputs=node.output, + upper=1 + ) + new_nodes.append(new_node) + +new_graph = helper.make_graph( + new_nodes, + "new_graph", + inputs=model.graph.input, + outputs=model.graph.output, + value_info=model.graph.value_info, + initializer=model.graph.initializer +) + +new_model = helper.make_model(new_graph, producer_name=model.producer_name, opset_imports=model.opset_import, ir_version=model.ir_version) +onnx.save(new_model, "qwen2.onnx", save_as_external_data=True) \ No newline at end of file diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md new file mode 100644 index 0000000000..43ad2ce423 --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md @@ -0,0 +1,38 @@ +# README + +-此README对qwen2-7b 在310b环境离线模型导出与推理脚本及其使用方式进行介绍 + +## 环境准备 + +前往昇腾社区(https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha002/cannquickstart/quickstart/instg_000021.html),按照说明下载安装。 + +## 依赖 + +### 脚本依赖 + +需要下载ascend-llm(https://gitee.com/yinghuo302/ascend-llm) + +git clone https://gitee.com/yinghuo302/ascend-llm + +### 第三方依赖 + +模型导出和推理依赖在requirement文件夹下requirements_export.txt,使用pip安装 + +## 脚本更新 + +通过diff.patch 更新ascend_llm 命令:patch -p0 < diff.patch + +## 模型更新 + +通过diff_model.patch 更新transformer库中modeling_qwen2 命令:patch -p0 < diff_model.patch + + +## 模型量化导出与推理运行 + +命令参考ascend-llm中的README.md +在onnx转om模型之间,需要将export_Trilu.py拷贝到onnx模型目录下,修改脚本中onnx名称与onnx对应执行脚本 +其中atc命令中的n_layer=28,n_head=4,head_dim=128 + +## 精度测试 + +先下载CEval,BoolQ,GSM8K数据集到inference目录下,将test.py 拷贝到/ascend_llm/inference/路径下 命令参考模型推理运行的命令 \ No newline at end of file diff 
--git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt new file mode 100644 index 0000000000..62b2560a86 --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt @@ -0,0 +1,5 @@ +torch==2.1.0 +torch_npu +onnx +transformers==4.38.2 +lm-eval==0.4.2 # for eval \ No newline at end of file diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py new file mode 100644 index 0000000000..f1068d88f6 --- /dev/null +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py @@ -0,0 +1,62 @@ +import argparse +from config import InferrnceConfig +from inference import LlamaInterface + +def main(cli: bool, engine: LlamaInterface, dataset): + if cli: + if dataset == 'BoolQ': + engine.test_boolq() + elif dataset == 'CEval': + engine.test_ceval() + elif dataset == 'GSM8K': + engine.test_gsm8k() + else: + print("dataset is not support!") + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--cli', dest='cli', default=False, action='store_true', + help="run web ui by default, if add --cli, run cli." + ) + parser.add_argument("--kv_size", type=int, default=1024) + parser.add_argument( + "--engine", type=str, default="acl", + help="inference backend, onnx or acl" + ) + parser.add_argument( + "--sampling", type=str, default="top_k", + help="sampling method, greedy, top_k or top_p" + ) + parser.add_argument( + "--sampling_value", type=float,default=10, + help="if sampling method is seted to greedy, this argument will be ignored; if top_k, it means value of p; if top_p, it means value of p" + ) + parser.add_argument( + "--temperature", type=float,default=0.7, + help="sampling temperature if sampling method is seted to greedy, this argument will be ignored." + ) + parser.add_argument( + "--hf-dir", type=str, default="/root/model/tiny-llama-1.1B", + help="path to huggingface model dir" + ) + parser.add_argument( + "--model", type=str, default="/root/model/tiny-llama-seq-1-key-256-int8.om", + help="path to onnx or om model" + ) + parser.add_argument( + "--dataset", type=str, default="BoolQ" + ) + + args = parser.parse_args() + cfg = InferenceConfig( + hf_model_dir=args.hf_dir, + model=args.model, + max_cache_size=args.kv_size, + sampling_method=args.sampling, + sampling_value=args.sampling_value, + temperature=args.temperature, + session_type=args.engine, + ) + engine = LlamaInterface(cfg) + main(args.cli,engine,args.dataset) \ No newline at end of file -- Gitee From 697b4783fcb1b5a4ca04092aaaa878c0e5b5a70f Mon Sep 17 00:00:00 2001 From: linsicong Date: Thu, 5 Sep 2024 03:39:22 +0000 Subject: [PATCH 2/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md. 
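As a usage sketch for the test.py interface above (not prescribed by the patch; it assumes test.py has been copied into ascend-llm/inference/ next to the downloaded datasets, and that the two placeholder paths are replaced with a real OM model and a Hugging Face config directory), all three accuracy benchmarks can be driven in one go:

```python
# Usage sketch: run the BoolQ / CEval / GSM8K accuracy tests through test.py.
# Assumptions: this file lives in ascend-llm/inference/ beside test.py and the
# downloaded datasets; the two paths below are placeholders to be replaced.
import subprocess

OM_MODEL = "/path/to/qwen2-7b.om"   # placeholder: converted OM model
HF_DIR = "/path/to/Qwen2-7B"        # placeholder: tokenizer and config only

for dataset in ("BoolQ", "CEval", "GSM8K"):
    subprocess.run(
        [
            "python", "test.py",
            "--model", OM_MODEL,
            "--hf-dir", HF_DIR,
            "--engine", "acl",
            "--sampling", "greedy",
            "--cli",
            "--dataset", dataset,
        ],
        check=True,
    )
```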
Signed-off-by: linsicong --- .../built-in/nlp/Qwen_for_Pytorch/readme.md | 223 +++++++++++++++--- 1 file changed, 193 insertions(+), 30 deletions(-) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md index 43ad2ce423..6964a3f5b9 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/readme.md @@ -2,37 +2,200 @@ -此README对qwen2-7b 在310b环境离线模型导出与推理脚本及其使用方式进行介绍 -## 环境准备 +- [概述](#ZH-CN_TOPIC_0000001172161501) + + - [输入输出数据](#section540883920406) + +- [推理环境准备](#ZH-CN_TOPIC_0000001126281702) + +- [快速上手](#ZH-CN_TOPIC_0000001126281700) + + - [获取源码](#section4622531142816) + - [模型转换与推理](#section741711594517) + + +# 概述 + +通义千问是阿里云自主研发的超大规模语言模型,能够回答问题、创作文字,还能表达观点、撰写代码。 + +## 输入输出数据 + + 输入数据 + + | 输入数据 | 数据类型 | 大小 | 数据排布格式 | + | --------------- | -------- | ------------------------------------ | ------------ | + | input_ids | int64 | 1 x 1 | ND | + | attention_mask | int64 | 1 x 1 | ND | + | position_ids | int64 | 1 x 1 | ND | + | past_key_values | int64 | layers,2,1,n_heads, kv_len, head_dim | ND | + +- 输出数据 + + | 输出数据 | 数据类型 | 大小 | 数据排布格式 | + | -------------- | -------- | ----------------------------- | ------------ | + | logits | FLOAT32 | 1 x vocab_size | ND | + | out_key_values | FLOAT16 | layers,2,1,36,kv_len,head_dim | ND | + +# 推理环境准备 + + **表 1** 版本配套表 -前往昇腾社区(https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha002/cannquickstart/quickstart/instg_000021.html),按照说明下载安装。 + | 配套 | 版本 | 取包地址环境准备指导 | + | ---------------------------------------------------- | ------------------- | --------------------------------------------------------------------------------------------------------------- | + | 固件与驱动 | Ascend HDK 24.1.RC3 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | + | CANN | CANN 8.0.RC3 | https://cmc-szv.clouddragon.huawei.com/cmcversion/index/releaseView?deltaId=10860207193326848&isSelect=Software | + | Python | 3.9.19 | - | + | PyTorch | 2.1.0 | - | + | 说明:310B推理卡请以CANN版本选择实际固件与驱动版本。 | \ | -## 依赖 +# 快速上手 -### 脚本依赖 +## 获取源码 -需要下载ascend-llm(https://gitee.com/yinghuo302/ascend-llm) - -git clone https://gitee.com/yinghuo302/ascend-llm - -### 第三方依赖 - -模型导出和推理依赖在requirement文件夹下requirements_export.txt,使用pip安装 - -## 脚本更新 - -通过diff.patch 更新ascend_llm 命令:patch -p0 < diff.patch - -## 模型更新 - -通过diff_model.patch 更新transformer库中modeling_qwen2 命令:patch -p0 < diff_model.patch - - -## 模型量化导出与推理运行 - -命令参考ascend-llm中的README.md -在onnx转om模型之间,需要将export_Trilu.py拷贝到onnx模型目录下,修改脚本中onnx名称与onnx对应执行脚本 -其中atc命令中的n_layer=28,n_head=4,head_dim=128 - -## 精度测试 - -先下载CEval,BoolQ,GSM8K数据集到inference目录下,将test.py 拷贝到/ascend_llm/inference/路径下 命令参考模型推理运行的命令 \ No newline at end of file +1. 获取源码。 + + ``` + # 获取源码 commitId :1392d7f 此代码库已经不更新,可以按下面取最新版 + git clone https://gitee.com/yinghuo302/ascend-llm + cd ascend-llm + patch -p0 < diff.patch + cp $install_python_path/lib/site-packages/transformers/models/qwen2/modeling_qwen2.py . + patch -p0 < diff_model.patch + cp modeling_qwen2.py $install_python_path/lib/site-packages/transformers/models/qwen2 + ``` + +2. 安装依赖。 + + ``` + pip install -r requirements.txt + ``` + +## 模型转换与推理 + +1. 
环境搭建。 + + - protoc安装 + + 根据昇腾文档选择合适的protoc,此版本配套使用的protoc版本最低为 1.13.0 + 进入https://github.com/protocolbuffers/protobuf/releases下载对应版本 + ``` + # 安装protoc==1.13.0, 找一空闲目录下载 + tar -zxvf protobuf-all-3.13.0.tar.gz + cd protobuf-3.13.0 + apt-get update + apt-get install autoconf automake libtool + ./autogen.sh + ./configure + make -j4 + make install + sudo ldconfig + protoc --version # 查看版本号 + ``` + + - 算子编译部署 + ``` + # 将./custom_op/matmul_integer_plugin.cc 拷贝到指定路径 + cd MiniCPM_for_Pytorch + export ASCEND_PATH=/usr/local/Ascend/ascend-toolkit/latest + cp custom_op/matmul_integer_plugin.cc $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx/framework/onnx_plugin/ + cd $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx + ``` + 打开build.sh,找到下面四个环境变量,解开注释并修改如下: + ``` + export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include + export TOOLCHAIN_DIR=/usr + export AICPU_KERNEL_TARGET=cust_aicpu_kernels + export AICPU_SOC_VERSION=Ascend310B4 + ``` + - 编译运行 + ``` + ./build.sh + cd build_out/ + ./custom_opp_ubuntu_aarch64.run + # 生成文件到customize到默认目录 $ASCEND_PATH/opp/vendors/,删除冗余文件 + cd $ASCEND_PATH/opp/vendors/customize + rm -rf op_impl/ op_proto/ + ``` + +2. 模型转换(进入export_llama目录)。 + + + 1). 导出onnx模型。 + + + python export_llama.py --model ${模型文件路径} --output ${输出onnx文件路径} + + +- 参数说明: + - model_name: 模型名称 + - model_type: 模型类型 + - save_path: 模型权重保存文件夹 + + + 2). 使用ATC工具将ONNX模型转OM模型。 + +1. 配置环境变量。 + + + source /usr/local/Ascend/ascend-toolkit/set_env.sh + + +2. 执行命令查看芯片名称($\{chip\_name\})。 + + + npu-smi info + +#该设备芯片名为Ascend310P3 (自行替换) +会显如下: + + +-------------------+-----------------+------------------------------------------------------+ + | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page) | + | Chip Device | Bus-Id | AICore(%) Memory-Usage(MB) | + +===================+=================+======================================================+ + | 0 310P3 | OK | 15.8 42 0 / 0 | + | 0 0 | 0000:82:00.0 | 0 1074 / 21534 | + +===================+=================+======================================================+ + | 1 310P3 | OK | 15.4 43 0 / 0 | + | 0 1 | 0000:89:00.0 | 0 1070 / 21534 | + +===================+=================+======================================================+ + ``` + +3. 执行ATC命令。 + + atc --framework=5 --model=${onnx文件路径} --output=${输出文件名} --input_format=ND --input_shape="input_ids:1,1;attention_mask:1,1025;position_ids:1,1;past_key_values:28,2,1,4,1024,128" --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype + + +- 参数说明: + + - model:为ONNX模型文件。 + - framework:5代表ONNX模型。 + - output:输出的OM模型。 + - input\_format:输入数据的格式。 + - input\_shape:输入数据的shape。 + - log:日志级别。 + - soc\_version:处理器型号。 + + + 运行成功后生成om后缀的模型文件。 + +3. 开始推理验证。 + 1). 执行推理前准备工作: + A)在端侧设备上如310B1 上安装对应cann,驱动等 + B)进入inference, 安装相关依赖 pip install -r requirements.txt + + 2). 
执行推理: + + python main.py --model ${om文件路径} --hf-dir ${模型文件路径} --engine acl --sampling greedy --cli + +- 参数说明: + - model:om模型路径 + - hf-dir:需要tokenizer和模型配置文件,权重不需要 + - engine:310B上只能acl + - sampling:greedy/top_p/top_k + - cli:表示在终端运行 + 说明: 上面参数根据实际情况修改 + +3.数据集精度验证: + 先下载CEval,BoolQ,GSM8K数据集到inference目录下,具体路径为./inference/dataset,将test.py文件放到inference路径下 + + python test.py --model ${om文件路径} --hf-dir ${模型文件路径} --engine acl --sampling greedy --cli --dataset=BoolQ/CEval/GSM8K \ No newline at end of file -- Gitee From f8a71575f556d8bce8dae3e68ad362761cc17851 Mon Sep 17 00:00:00 2001 From: linsicong Date: Thu, 5 Sep 2024 03:43:06 +0000 Subject: [PATCH 3/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py. Signed-off-by: linsicong --- .../nlp/Qwen_for_Pytorch/export_Trilu.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py index 9bc37458c6..0147f8fdf5 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/export_Trilu.py @@ -1,3 +1,32 @@ +# BSD 3-Clause License + +# Copyright (c) 2017, +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import onnx import onnx.helper as helper from onnx import TensorProtos -- Gitee From f6cb0daaea75a5a6ad4350d96981e22955b4204a Mon Sep 17 00:00:00 2001 From: linsicong Date: Thu, 5 Sep 2024 07:04:02 +0000 Subject: [PATCH 4/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE. Signed-off-by: linsicong --- ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE | 1 + 1 file changed, 1 insertion(+) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE index db05a35866..2d7f4c35f4 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/LICENCE @@ -1,5 +1,6 @@ BSD 3-Clause License + Copyright (c) 2017, All rights reserved. 
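Before running ATC, the workflow above has the user adapt export_Trilu.py to the exported ONNX file. A minimal sketch of that Trilu rewrite, assuming the export is named qwen2.onnx and mirroring the script's behavior of dropping the optional k input and forcing upper=1, is:

```python
# Sketch of the Trilu rewrite performed before ATC conversion: drop the
# optional "k" input of every Trilu node, force the "upper" attribute to 1,
# and save the model again with external data (the Qwen2 weights exceed the
# 2 GB protobuf limit). Assumption: the exported file is named qwen2.onnx.
import onnx
from onnx import helper

model = onnx.load("qwen2.onnx")

for node in model.graph.node:
    if node.op_type == "Trilu":
        del node.input[1:]        # keep only the data input
        del node.attribute[:]     # rebuild attributes from scratch
        node.attribute.extend([helper.make_attribute("upper", 1)])

onnx.save(model, "qwen2_trilu.onnx", save_as_external_data=True)
```

The rewritten file (here qwen2_trilu.onnx, a hypothetical name) is then the one fed to the atc command in place of the original export.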
-- Gitee From f1d76b4a83071d6088eef026d7451f03f697714e Mon Sep 17 00:00:00 2001 From: linsicong Date: Thu, 5 Sep 2024 07:24:01 +0000 Subject: [PATCH 5/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch. Signed-off-by: linsicong --- .../built-in/nlp/Qwen_for_Pytorch/diff.patch | 4480 +---------------- 1 file changed, 32 insertions(+), 4448 deletions(-) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch index 7e1fe724a0..e630797666 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff.patch @@ -1,4458 +1,42 @@ -diff -uNr ascend-llm/.git/HEAD ascend-llm-qwen/.git/HEAD ---- ascend-llm/.git/HEAD 2024-09-04 19:28:06.528235700 +0800 -+++ ascend-llm-qwen/.git/HEAD 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --ref: refs/heads/lscno2 -diff -uNr ascend-llm/.git/config ascend-llm-qwen/.git/config ---- ascend-llm/.git/config 2024-09-04 19:29:48.493570100 +0800 -+++ ascend-llm-qwen/.git/config 1970-01-01 08:00:00.000000000 +0800 -@@ -1,15 +0,0 @@ --[core] -- repositoryformatversion = 0 -- filemode = false -- bare = false -- logallrefupdates = true -- symlinks = false -- ignorecase = true --[remote "origin"] -- url = https://gitee.com/yinghuo302/ascend-llm.git -- fetch = +refs/heads/*:refs/remotes/origin/* --[branch "main"] -- remote = origin -- merge = refs/heads/main --[branch "lscno2"] -- vscode-merge-base = origin/main -diff -uNr ascend-llm/.git/description ascend-llm-qwen/.git/description ---- ascend-llm/.git/description 2024-09-04 19:20:58.889995300 +0800 -+++ ascend-llm-qwen/.git/description 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --Unnamed repository; edit this file 'description' to name the repository. -diff -uNr ascend-llm/.git/hooks/applypatch-msg.sample ascend-llm-qwen/.git/hooks/applypatch-msg.sample ---- ascend-llm/.git/hooks/applypatch-msg.sample 2024-09-04 19:20:58.889995300 +0800 -+++ ascend-llm-qwen/.git/hooks/applypatch-msg.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,15 +0,0 @@ --#!/bin/sh --# --# An example hook script to check the commit log message taken by --# applypatch from an e-mail message. --# --# The hook should exit with non-zero status after issuing an --# appropriate message if it wants to stop the commit. The hook is --# allowed to edit the commit message file. --# --# To enable this hook, rename this file to "applypatch-msg". -- --. git-sh-setup --commitmsg="$(git rev-parse --git-path hooks/commit-msg)" --test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"} --: -diff -uNr ascend-llm/.git/hooks/commit-msg.sample ascend-llm-qwen/.git/hooks/commit-msg.sample ---- ascend-llm/.git/hooks/commit-msg.sample 2024-09-04 19:20:58.889995300 +0800 -+++ ascend-llm-qwen/.git/hooks/commit-msg.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,24 +0,0 @@ --#!/bin/sh --# --# An example hook script to check the commit log message. --# Called by "git commit" with one argument, the name of the file --# that has the commit message. The hook should exit with non-zero --# status after issuing an appropriate message if it wants to stop the --# commit. The hook is allowed to edit the commit message file. --# --# To enable this hook, rename this file to "commit-msg". -- --# Uncomment the below to add a Signed-off-by line to the message. --# Doing this in a hook is a bad idea in general, but the prepare-commit-msg --# hook is more suited to it. 
--# --# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') --# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" -- --# This example catches duplicate Signed-off-by lines. -- --test "" = "$(grep '^Signed-off-by: ' "$1" | -- sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { -- echo >&2 Duplicate Signed-off-by lines. -- exit 1 --} -diff -uNr ascend-llm/.git/hooks/fsmonitor-watchman.sample ascend-llm-qwen/.git/hooks/fsmonitor-watchman.sample ---- ascend-llm/.git/hooks/fsmonitor-watchman.sample 2024-09-04 19:20:58.893526000 +0800 -+++ ascend-llm-qwen/.git/hooks/fsmonitor-watchman.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,174 +0,0 @@ --#!/usr/bin/perl -- --use strict; --use warnings; --use IPC::Open2; -- --# An example hook script to integrate Watchman --# (https://facebook.github.io/watchman/) with git to speed up detecting --# new and modified files. --# --# The hook is passed a version (currently 2) and last update token --# formatted as a string and outputs to stdout a new update token and --# all files that have been modified since the update token. Paths must --# be relative to the root of the working tree and separated by a single NUL. --# --# To enable this hook, rename this file to "query-watchman" and set --# 'git config core.fsmonitor .git/hooks/query-watchman' --# --my ($version, $last_update_token) = @ARGV; -- --# Uncomment for debugging --# print STDERR "$0 $version $last_update_token\n"; -- --# Check the hook interface version --if ($version ne 2) { -- die "Unsupported query-fsmonitor hook version '$version'.\n" . -- "Falling back to scanning...\n"; --} -- --my $git_work_tree = get_working_dir(); -- --my $retry = 1; -- --my $json_pkg; --eval { -- require JSON::XS; -- $json_pkg = "JSON::XS"; -- 1; --} or do { -- require JSON::PP; -- $json_pkg = "JSON::PP"; --}; -- --launch_watchman(); -- --sub launch_watchman { -- my $o = watchman_query(); -- if (is_work_tree_watched($o)) { -- output_result($o->{clock}, @{$o->{files}}); -- } --} -- --sub output_result { -- my ($clockid, @files) = @_; -- -- # Uncomment for debugging watchman output -- # open (my $fh, ">", ".git/watchman-output.out"); -- # binmode $fh, ":utf8"; -- # print $fh "$clockid\n@files\n"; -- # close $fh; -- -- binmode STDOUT, ":utf8"; -- print $clockid; -- print "\0"; -- local $, = "\0"; -- print @files; --} -- --sub watchman_clock { -- my $response = qx/watchman clock "$git_work_tree"/; -- die "Failed to get clock id on '$git_work_tree'.\n" . -- "Falling back to scanning...\n" if $? != 0; -- -- return $json_pkg->new->utf8->decode($response); --} -- --sub watchman_query { -- my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty') -- or die "open2() failed: $!\n" . -- "Falling back to scanning...\n"; -- -- # In the query expression below we're asking for names of files that -- # changed since $last_update_token but not from the .git folder. -- # -- # To accomplish this, we're using the "since" generator to use the -- # recency index to select candidate nodes and "fields" to limit the -- # output to file names only. Then we're using the "expression" term to -- # further constrain the results. 
-- my $last_update_line = ""; -- if (substr($last_update_token, 0, 1) eq "c") { -- $last_update_token = "\"$last_update_token\""; -- $last_update_line = qq[\n"since": $last_update_token,]; -- } -- my $query = <<" END"; -- ["query", "$git_work_tree", {$last_update_line -- "fields": ["name"], -- "expression": ["not", ["dirname", ".git"]] -- }] -- END -- -- # Uncomment for debugging the watchman query -- # open (my $fh, ">", ".git/watchman-query.json"); -- # print $fh $query; -- # close $fh; -- -- print CHLD_IN $query; -- close CHLD_IN; -- my $response = do {local $/; }; -- -- # Uncomment for debugging the watch response -- # open ($fh, ">", ".git/watchman-response.json"); -- # print $fh $response; -- # close $fh; -- -- die "Watchman: command returned no output.\n" . -- "Falling back to scanning...\n" if $response eq ""; -- die "Watchman: command returned invalid output: $response\n" . -- "Falling back to scanning...\n" unless $response =~ /^\{/; -- -- return $json_pkg->new->utf8->decode($response); --} -- --sub is_work_tree_watched { -- my ($output) = @_; -- my $error = $output->{error}; -- if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) { -- $retry--; -- my $response = qx/watchman watch "$git_work_tree"/; -- die "Failed to make watchman watch '$git_work_tree'.\n" . -- "Falling back to scanning...\n" if $? != 0; -- $output = $json_pkg->new->utf8->decode($response); -- $error = $output->{error}; -- die "Watchman: $error.\n" . -- "Falling back to scanning...\n" if $error; -- -- # Uncomment for debugging watchman output -- # open (my $fh, ">", ".git/watchman-output.out"); -- # close $fh; -- -- # Watchman will always return all files on the first query so -- # return the fast "everything is dirty" flag to git and do the -- # Watchman query just to get it over with now so we won't pay -- # the cost in git to look up each individual file. -- my $o = watchman_clock(); -- $error = $output->{error}; -- -- die "Watchman: $error.\n" . -- "Falling back to scanning...\n" if $error; -- -- output_result($o->{clock}, ("/")); -- $last_update_token = $o->{clock}; -- -- eval { launch_watchman() }; -- return 0; -- } -- -- die "Watchman: $error.\n" . -- "Falling back to scanning...\n" if $error; -- -- return 1; --} -- --sub get_working_dir { -- my $working_dir; -- if ($^O =~ 'msys' || $^O =~ 'cygwin') { -- $working_dir = Win32::GetCwd(); -- $working_dir =~ tr/\\/\//; -- } else { -- require Cwd; -- $working_dir = Cwd::cwd(); -- } -- -- return $working_dir; --} -diff -uNr ascend-llm/.git/hooks/post-update.sample ascend-llm-qwen/.git/hooks/post-update.sample ---- ascend-llm/.git/hooks/post-update.sample 2024-09-04 19:20:58.893526000 +0800 -+++ ascend-llm-qwen/.git/hooks/post-update.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,8 +0,0 @@ --#!/bin/sh --# --# An example hook script to prepare a packed repository for use over --# dumb transports. --# --# To enable this hook, rename this file to "post-update". -- --exec git update-server-info -diff -uNr ascend-llm/.git/hooks/pre-applypatch.sample ascend-llm-qwen/.git/hooks/pre-applypatch.sample ---- ascend-llm/.git/hooks/pre-applypatch.sample 2024-09-04 19:20:58.893526000 +0800 -+++ ascend-llm-qwen/.git/hooks/pre-applypatch.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,14 +0,0 @@ --#!/bin/sh --# --# An example hook script to verify what is about to be committed --# by applypatch from an e-mail message. 
--# --# The hook should exit with non-zero status after issuing an --# appropriate message if it wants to stop the commit. --# --# To enable this hook, rename this file to "pre-applypatch". -- --. git-sh-setup --precommit="$(git rev-parse --git-path hooks/pre-commit)" --test -x "$precommit" && exec "$precommit" ${1+"$@"} --: -diff -uNr ascend-llm/.git/hooks/pre-commit.sample ascend-llm-qwen/.git/hooks/pre-commit.sample ---- ascend-llm/.git/hooks/pre-commit.sample 2024-09-04 19:20:58.894662300 +0800 -+++ ascend-llm-qwen/.git/hooks/pre-commit.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,49 +0,0 @@ --#!/bin/sh --# --# An example hook script to verify what is about to be committed. --# Called by "git commit" with no arguments. The hook should --# exit with non-zero status after issuing an appropriate message if --# it wants to stop the commit. --# --# To enable this hook, rename this file to "pre-commit". -- --if git rev-parse --verify HEAD >/dev/null 2>&1 --then -- against=HEAD --else -- # Initial commit: diff against an empty tree object -- against=$(git hash-object -t tree /dev/null) --fi -- --# If you want to allow non-ASCII filenames set this variable to true. --allownonascii=$(git config --type=bool hooks.allownonascii) -- --# Redirect output to stderr. --exec 1>&2 -- --# Cross platform projects tend to avoid non-ASCII filenames; prevent --# them from being added to the repository. We exploit the fact that the --# printable range starts at the space character and ends with tilde. --if [ "$allownonascii" != "true" ] && -- # Note that the use of brackets around a tr range is ok here, (it's -- # even required, for portability to Solaris 10's /usr/bin/tr), since -- # the square bracket bytes happen to fall in the designated range. -- test $(git diff-index --cached --name-only --diff-filter=A -z $against | -- LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 --then -- cat <<\EOF --Error: Attempt to add a non-ASCII file name. -- --This can cause problems if you want to work with people on other platforms. -- --To be portable it is advisable to rename the file. -- --If you know what you are doing you can disable this check using: -- -- git config hooks.allownonascii true --EOF -- exit 1 --fi -- --# If there are whitespace errors, print the offending file names and fail. --exec git diff-index --check --cached $against -- -diff -uNr ascend-llm/.git/hooks/pre-merge-commit.sample ascend-llm-qwen/.git/hooks/pre-merge-commit.sample ---- ascend-llm/.git/hooks/pre-merge-commit.sample 2024-09-04 19:20:58.894662300 +0800 -+++ ascend-llm-qwen/.git/hooks/pre-merge-commit.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,13 +0,0 @@ --#!/bin/sh --# --# An example hook script to verify what is about to be committed. --# Called by "git merge" with no arguments. The hook should --# exit with non-zero status after issuing an appropriate message to --# stderr if it wants to stop the merge commit. --# --# To enable this hook, rename this file to "pre-merge-commit". -- --. git-sh-setup --test -x "$GIT_DIR/hooks/pre-commit" && -- exec "$GIT_DIR/hooks/pre-commit" --: -diff -uNr ascend-llm/.git/hooks/pre-push.sample ascend-llm-qwen/.git/hooks/pre-push.sample ---- ascend-llm/.git/hooks/pre-push.sample 2024-09-04 19:20:58.894662300 +0800 -+++ ascend-llm-qwen/.git/hooks/pre-push.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,53 +0,0 @@ --#!/bin/sh -- --# An example hook script to verify what is about to be pushed. Called by "git --# push" after it has checked the remote status, but before anything has been --# pushed. 
If this script exits with a non-zero status nothing will be pushed. --# --# This hook is called with the following parameters: --# --# $1 -- Name of the remote to which the push is being done --# $2 -- URL to which the push is being done --# --# If pushing without using a named remote those arguments will be equal. --# --# Information about the commits which are being pushed is supplied as lines to --# the standard input in the form: --# --# --# --# This sample shows how to prevent push of commits where the log message starts --# with "WIP" (work in progress). -- --remote="$1" --url="$2" -- --zero=$(git hash-object --stdin &2 "Found WIP commit in $local_ref, not pushing" -- exit 1 -- fi -- fi --done -- --exit 0 -diff -uNr ascend-llm/.git/hooks/pre-rebase.sample ascend-llm-qwen/.git/hooks/pre-rebase.sample ---- ascend-llm/.git/hooks/pre-rebase.sample 2024-09-04 19:20:58.894662300 +0800 -+++ ascend-llm-qwen/.git/hooks/pre-rebase.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,169 +0,0 @@ --#!/bin/sh --# --# Copyright (c) 2006, 2008 Junio C Hamano --# --# The "pre-rebase" hook is run just before "git rebase" starts doing --# its job, and can prevent the command from running by exiting with --# non-zero status. --# --# The hook is called with the following parameters: --# --# $1 -- the upstream the series was forked from. --# $2 -- the branch being rebased (or empty when rebasing the current branch). --# --# This sample shows how to prevent topic branches that are already --# merged to 'next' branch from getting rebased, because allowing it --# would result in rebasing already published history. -- --publish=next --basebranch="$1" --if test "$#" = 2 --then -- topic="refs/heads/$2" --else -- topic=`git symbolic-ref HEAD` || -- exit 0 ;# we do not interrupt rebasing detached HEAD --fi -- --case "$topic" in --refs/heads/??/*) -- ;; --*) -- exit 0 ;# we do not interrupt others. -- ;; --esac -- --# Now we are dealing with a topic branch being rebased --# on top of master. Is it OK to rebase it? -- --# Does the topic really exist? --git show-ref -q "$topic" || { -- echo >&2 "No such branch $topic" -- exit 1 --} -- --# Is topic fully merged to master? --not_in_master=`git rev-list --pretty=oneline ^master "$topic"` --if test -z "$not_in_master" --then -- echo >&2 "$topic is fully merged to master; better remove it." -- exit 1 ;# we could allow it, but there is no point. --fi -- --# Is topic ever merged to next? If so you should not be rebasing it. --only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` --only_next_2=`git rev-list ^master ${publish} | sort` --if test "$only_next_1" = "$only_next_2" --then -- not_in_topic=`git rev-list "^$topic" master` -- if test -z "$not_in_topic" -- then -- echo >&2 "$topic is already up to date with master" -- exit 1 ;# we could allow it, but there is no point. 
-- else -- exit 0 -- fi --else -- not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` -- /usr/bin/perl -e ' -- my $topic = $ARGV[0]; -- my $msg = "* $topic has commits already merged to public branch:\n"; -- my (%not_in_next) = map { -- /^([0-9a-f]+) /; -- ($1 => 1); -- } split(/\n/, $ARGV[1]); -- for my $elem (map { -- /^([0-9a-f]+) (.*)$/; -- [$1 => $2]; -- } split(/\n/, $ARGV[2])) { -- if (!exists $not_in_next{$elem->[0]}) { -- if ($msg) { -- print STDERR $msg; -- undef $msg; -- } -- print STDERR " $elem->[1]\n"; -- } -- } -- ' "$topic" "$not_in_next" "$not_in_master" -- exit 1 --fi -- --<<\DOC_END -- --This sample hook safeguards topic branches that have been --published from being rewound. -- --The workflow assumed here is: -- -- * Once a topic branch forks from "master", "master" is never -- merged into it again (either directly or indirectly). -- -- * Once a topic branch is fully cooked and merged into "master", -- it is deleted. If you need to build on top of it to correct -- earlier mistakes, a new topic branch is created by forking at -- the tip of the "master". This is not strictly necessary, but -- it makes it easier to keep your history simple. -- -- * Whenever you need to test or publish your changes to topic -- branches, merge them into "next" branch. -- --The script, being an example, hardcodes the publish branch name --to be "next", but it is trivial to make it configurable via --$GIT_DIR/config mechanism. -- --With this workflow, you would want to know: -- --(1) ... if a topic branch has ever been merged to "next". Young -- topic branches can have stupid mistakes you would rather -- clean up before publishing, and things that have not been -- merged into other branches can be easily rebased without -- affecting other people. But once it is published, you would -- not want to rewind it. -- --(2) ... if a topic branch has been fully merged to "master". -- Then you can delete it. More importantly, you should not -- build on top of it -- other people may already want to -- change things related to the topic as patches against your -- "master", so if you need further changes, it is better to -- fork the topic (perhaps with the same name) afresh from the -- tip of "master". -- --Let's look at this example: -- -- o---o---o---o---o---o---o---o---o---o "next" -- / / / / -- / a---a---b A / / -- / / / / -- / / c---c---c---c B / -- / / / \ / -- / / / b---b C \ / -- / / / / \ / -- ---o---o---o---o---o---o---o---o---o---o---o "master" -- -- --A, B and C are topic branches. -- -- * A has one fix since it was merged up to "next". -- -- * B has finished. It has been fully merged up to "master" and "next", -- and is ready to be deleted. -- -- * C has not merged to "next" at all. -- --We would want to allow C to be rebased, refuse A, and encourage --B to be deleted. -- --To compute (1): -- -- git rev-list ^master ^topic next -- git rev-list ^master next -- -- if these match, topic has not merged in next at all. -- --To compute (2): -- -- git rev-list master..topic -- -- if this is empty, it is fully merged to "master". -- --DOC_END -diff -uNr ascend-llm/.git/hooks/pre-receive.sample ascend-llm-qwen/.git/hooks/pre-receive.sample ---- ascend-llm/.git/hooks/pre-receive.sample 2024-09-04 19:20:58.894662300 +0800 -+++ ascend-llm-qwen/.git/hooks/pre-receive.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,24 +0,0 @@ --#!/bin/sh --# --# An example hook script to make use of push options. 
--# The example simply echoes all push options that start with 'echoback=' --# and rejects all pushes when the "reject" push option is used. --# --# To enable this hook, rename this file to "pre-receive". -- --if test -n "$GIT_PUSH_OPTION_COUNT" --then -- i=0 -- while test "$i" -lt "$GIT_PUSH_OPTION_COUNT" -- do -- eval "value=\$GIT_PUSH_OPTION_$i" -- case "$value" in -- echoback=*) -- echo "echo from the pre-receive-hook: ${value#*=}" >&2 -- ;; -- reject) -- exit 1 -- esac -- i=$((i + 1)) -- done --fi -diff -uNr ascend-llm/.git/hooks/prepare-commit-msg.sample ascend-llm-qwen/.git/hooks/prepare-commit-msg.sample ---- ascend-llm/.git/hooks/prepare-commit-msg.sample 2024-09-04 19:20:58.894662300 +0800 -+++ ascend-llm-qwen/.git/hooks/prepare-commit-msg.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,42 +0,0 @@ --#!/bin/sh --# --# An example hook script to prepare the commit log message. --# Called by "git commit" with the name of the file that has the --# commit message, followed by the description of the commit --# message's source. The hook's purpose is to edit the commit --# message file. If the hook fails with a non-zero status, --# the commit is aborted. --# --# To enable this hook, rename this file to "prepare-commit-msg". -- --# This hook includes three examples. The first one removes the --# "# Please enter the commit message..." help message. --# --# The second includes the output of "git diff --name-status -r" --# into the message, just before the "git status" output. It is --# commented because it doesn't cope with --amend or with squashed --# commits. --# --# The third example adds a Signed-off-by line to the message, that can --# still be edited. This is rarely a good idea. -- --COMMIT_MSG_FILE=$1 --COMMIT_SOURCE=$2 --SHA1=$3 -- --/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE" -- --# case "$COMMIT_SOURCE,$SHA1" in --# ,|template,) --# /usr/bin/perl -i.bak -pe ' --# print "\n" . `git diff --cached --name-status -r` --# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;; --# *) ;; --# esac -- --# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') --# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE" --# if test -z "$COMMIT_SOURCE" --# then --# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE" --# fi -diff -uNr ascend-llm/.git/hooks/push-to-checkout.sample ascend-llm-qwen/.git/hooks/push-to-checkout.sample ---- ascend-llm/.git/hooks/push-to-checkout.sample 2024-09-04 19:20:58.895659900 +0800 -+++ ascend-llm-qwen/.git/hooks/push-to-checkout.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,78 +0,0 @@ --#!/bin/sh -- --# An example hook script to update a checked-out tree on a git push. --# --# This hook is invoked by git-receive-pack(1) when it reacts to git --# push and updates reference(s) in its repository, and when the push --# tries to update the branch that is currently checked out and the --# receive.denyCurrentBranch configuration variable is set to --# updateInstead. --# --# By default, such a push is refused if the working tree and the index --# of the remote repository has any difference from the currently --# checked out commit; when both the working tree and the index match --# the current commit, they are updated to match the newly pushed tip --# of the branch. This hook is to be used to override the default --# behaviour; however the code below reimplements the default behaviour --# as a starting point for convenient modification. 
--# --# The hook receives the commit with which the tip of the current --# branch is going to be updated: --commit=$1 -- --# It can exit with a non-zero status to refuse the push (when it does --# so, it must not modify the index or the working tree). --die () { -- echo >&2 "$*" -- exit 1 --} -- --# Or it can make any necessary changes to the working tree and to the --# index to bring them to the desired state when the tip of the current --# branch is updated to the new commit, and exit with a zero status. --# --# For example, the hook can simply run git read-tree -u -m HEAD "$1" --# in order to emulate git fetch that is run in the reverse direction --# with git push, as the two-tree form of git read-tree -u -m is --# essentially the same as git switch or git checkout that switches --# branches while keeping the local changes in the working tree that do --# not interfere with the difference between the branches. -- --# The below is a more-or-less exact translation to shell of the C code --# for the default behaviour for git's push-to-checkout hook defined in --# the push_to_deploy() function in builtin/receive-pack.c. --# --# Note that the hook will be executed from the repository directory, --# not from the working tree, so if you want to perform operations on --# the working tree, you will have to adapt your code accordingly, e.g. --# by adding "cd .." or using relative paths. -- --if ! git update-index -q --ignore-submodules --refresh --then -- die "Up-to-date check failed" --fi -- --if ! git diff-files --quiet --ignore-submodules -- --then -- die "Working directory has unstaged changes" --fi -- --# This is a rough translation of: --# --# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX --if git cat-file -e HEAD 2>/dev/null --then -- head=HEAD --else -- head=$(git hash-object -t tree --stdin &2 -- exit 1 --} -- --unset GIT_DIR GIT_WORK_TREE --cd "$worktree" && -- --if grep -q "^diff --git " "$1" --then -- validate_patch "$1" --else -- validate_cover_letter "$1" --fi && -- --if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL" --then -- git config --unset-all sendemail.validateWorktree && -- trap 'git worktree remove -ff "$worktree"' EXIT && -- validate_series --fi -diff -uNr ascend-llm/.git/hooks/update.sample ascend-llm-qwen/.git/hooks/update.sample ---- ascend-llm/.git/hooks/update.sample 2024-09-04 19:20:58.895659900 +0800 -+++ ascend-llm-qwen/.git/hooks/update.sample 1970-01-01 08:00:00.000000000 +0800 -@@ -1,128 +0,0 @@ --#!/bin/sh --# --# An example hook script to block unannotated tags from entering. --# Called by "git receive-pack" with arguments: refname sha1-old sha1-new --# --# To enable this hook, rename this file to "update". --# --# Config --# ------ --# hooks.allowunannotated --# This boolean sets whether unannotated tags will be allowed into the --# repository. By default they won't be. --# hooks.allowdeletetag --# This boolean sets whether deleting tags will be allowed in the --# repository. By default they won't be. --# hooks.allowmodifytag --# This boolean sets whether a tag may be modified after creation. By default --# it won't be. --# hooks.allowdeletebranch --# This boolean sets whether deleting branches will be allowed in the --# repository. By default they won't be. --# hooks.denycreatebranch --# This boolean sets whether remotely creating branches will be denied --# in the repository. By default this is allowed. 
--# -- --# --- Command line --refname="$1" --oldrev="$2" --newrev="$3" -- --# --- Safety check --if [ -z "$GIT_DIR" ]; then -- echo "Don't run this script from the command line." >&2 -- echo " (if you want, you could supply GIT_DIR then run" >&2 -- echo " $0 )" >&2 -- exit 1 --fi -- --if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then -- echo "usage: $0 " >&2 -- exit 1 --fi -- --# --- Config --allowunannotated=$(git config --type=bool hooks.allowunannotated) --allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch) --denycreatebranch=$(git config --type=bool hooks.denycreatebranch) --allowdeletetag=$(git config --type=bool hooks.allowdeletetag) --allowmodifytag=$(git config --type=bool hooks.allowmodifytag) -- --# check for no description --projectdesc=$(sed -e '1q' "$GIT_DIR/description") --case "$projectdesc" in --"Unnamed repository"* | "") -- echo "*** Project description file hasn't been set" >&2 -- exit 1 -- ;; --esac -- --# --- Check types --# if $newrev is 0000...0000, it's a commit to delete a ref. --zero=$(git hash-object --stdin &2 -- echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 -- exit 1 -- fi -- ;; -- refs/tags/*,delete) -- # delete tag -- if [ "$allowdeletetag" != "true" ]; then -- echo "*** Deleting a tag is not allowed in this repository" >&2 -- exit 1 -- fi -- ;; -- refs/tags/*,tag) -- # annotated tag -- if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 -- then -- echo "*** Tag '$refname' already exists." >&2 -- echo "*** Modifying a tag is not allowed in this repository." >&2 -- exit 1 -- fi -- ;; -- refs/heads/*,commit) -- # branch -- if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then -- echo "*** Creating a branch is not allowed in this repository" >&2 -- exit 1 -- fi -- ;; -- refs/heads/*,delete) -- # delete branch -- if [ "$allowdeletebranch" != "true" ]; then -- echo "*** Deleting a branch is not allowed in this repository" >&2 -- exit 1 -- fi -- ;; -- refs/remotes/*,commit) -- # tracking branch -- ;; -- refs/remotes/*,delete) -- # delete tracking branch -- if [ "$allowdeletebranch" != "true" ]; then -- echo "*** Deleting a tracking branch is not allowed in this repository" >&2 -- exit 1 -- fi -- ;; -- *) -- # Anything else (is there anything else?) -- echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 -- exit 1 -- ;; --esac -- --# --- Finished --exit 0 -Binary files ascend-llm/.git/index and ascend-llm-qwen/.git/index differ -diff -uNr ascend-llm/.git/info/exclude ascend-llm-qwen/.git/info/exclude ---- ascend-llm/.git/info/exclude 2024-09-04 19:20:58.895659900 +0800 -+++ ascend-llm-qwen/.git/info/exclude 1970-01-01 08:00:00.000000000 +0800 -@@ -1,6 +0,0 @@ --# git ls-files --others --exclude-from=.git/info/exclude --# Lines that start with '#' are comments. 
--# For a project mostly in C, the following would be a good set of --# exclude patterns (uncomment them if you want to use them): --# *.[oa] --# *~ -diff -uNr ascend-llm/.git/logs/HEAD ascend-llm-qwen/.git/logs/HEAD ---- ascend-llm/.git/logs/HEAD 2024-09-04 19:28:06.528235700 +0800 -+++ ascend-llm-qwen/.git/logs/HEAD 1970-01-01 08:00:00.000000000 +0800 -@@ -1,2 +0,0 @@ --0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725448863 +0800 clone: from https://gitee.com/yinghuo302/ascend-llm.git --1392d7fccbf5fbf1bf4df781cca919abd046a80d 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725449286 +0800 checkout: moving from main to lscno2 -diff -uNr ascend-llm/.git/logs/refs/heads/lscno2 ascend-llm-qwen/.git/logs/refs/heads/lscno2 ---- ascend-llm/.git/logs/refs/heads/lscno2 2024-09-04 19:27:54.495854600 +0800 -+++ ascend-llm-qwen/.git/logs/refs/heads/lscno2 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725449274 +0800 branch: Created from main -diff -uNr ascend-llm/.git/logs/refs/heads/main ascend-llm-qwen/.git/logs/refs/heads/main ---- ascend-llm/.git/logs/refs/heads/main 2024-09-04 19:21:03.028244000 +0800 -+++ ascend-llm-qwen/.git/logs/refs/heads/main 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725448863 +0800 clone: from https://gitee.com/yinghuo302/ascend-llm.git -diff -uNr ascend-llm/.git/logs/refs/remotes/origin/HEAD ascend-llm-qwen/.git/logs/refs/remotes/origin/HEAD ---- ascend-llm/.git/logs/refs/remotes/origin/HEAD 2024-09-04 19:21:03.022263900 +0800 -+++ ascend-llm-qwen/.git/logs/refs/remotes/origin/HEAD 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --0000000000000000000000000000000000000000 1392d7fccbf5fbf1bf4df781cca919abd046a80d tangxian 1725448863 +0800 clone: from https://gitee.com/yinghuo302/ascend-llm.git -Binary files ascend-llm/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.idx and ascend-llm-qwen/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.idx differ -Binary files ascend-llm/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.pack and ascend-llm-qwen/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.pack differ -Binary files ascend-llm/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.rev and ascend-llm-qwen/.git/objects/pack/pack-65015997362f121155c3c86b628effe2d83efe4b.rev differ -diff -uNr ascend-llm/.git/packed-refs ascend-llm-qwen/.git/packed-refs ---- ascend-llm/.git/packed-refs 2024-09-04 19:21:03.020279000 +0800 -+++ ascend-llm-qwen/.git/packed-refs 1970-01-01 08:00:00.000000000 +0800 -@@ -1,2 +0,0 @@ --# pack-refs with: peeled fully-peeled sorted --1392d7fccbf5fbf1bf4df781cca919abd046a80d refs/remotes/origin/main -diff -uNr ascend-llm/.git/refs/heads/lscno2 ascend-llm-qwen/.git/refs/heads/lscno2 ---- ascend-llm/.git/refs/heads/lscno2 2024-09-04 19:27:54.495854600 +0800 -+++ ascend-llm-qwen/.git/refs/heads/lscno2 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --1392d7fccbf5fbf1bf4df781cca919abd046a80d -diff -uNr ascend-llm/.git/refs/heads/main ascend-llm-qwen/.git/refs/heads/main ---- ascend-llm/.git/refs/heads/main 2024-09-04 19:21:03.028244000 +0800 -+++ ascend-llm-qwen/.git/refs/heads/main 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --1392d7fccbf5fbf1bf4df781cca919abd046a80d -diff -uNr ascend-llm/.git/refs/remotes/origin/HEAD 
ascend-llm-qwen/.git/refs/remotes/origin/HEAD ---- ascend-llm/.git/refs/remotes/origin/HEAD 2024-09-04 19:21:03.022263900 +0800 -+++ ascend-llm-qwen/.git/refs/remotes/origin/HEAD 1970-01-01 08:00:00.000000000 +0800 -@@ -1 +0,0 @@ --ref: refs/remotes/origin/main -diff -uNr ascend-llm/.gitignore ascend-llm-qwen/.gitignore ---- ascend-llm/.gitignore 2024-09-04 19:21:03.040203700 +0800 -+++ ascend-llm-qwen/.gitignore 1970-01-01 08:00:00.000000000 +0800 -@@ -1,2 +0,0 @@ --*.pyc --inference/dist/* -\ No newline at end of file -diff -uNr ascend-llm/LICENSE ascend-llm-qwen/LICENSE ---- ascend-llm/LICENSE 2024-09-04 19:21:03.040203700 +0800 -+++ ascend-llm-qwen/LICENSE 1970-01-01 08:00:00.000000000 +0800 -@@ -1,201 +0,0 @@ -- Apache License -- Version 2.0, January 2004 -- http://www.apache.org/licenses/ -- -- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -- -- 1. Definitions. -- -- "License" shall mean the terms and conditions for use, reproduction, -- and distribution as defined by Sections 1 through 9 of this document. -- -- "Licensor" shall mean the copyright owner or entity authorized by -- the copyright owner that is granting the License. -- -- "Legal Entity" shall mean the union of the acting entity and all -- other entities that control, are controlled by, or are under common -- control with that entity. For the purposes of this definition, -- "control" means (i) the power, direct or indirect, to cause the -- direction or management of such entity, whether by contract or -- otherwise, or (ii) ownership of fifty percent (50%) or more of the -- outstanding shares, or (iii) beneficial ownership of such entity. -- -- "You" (or "Your") shall mean an individual or Legal Entity -- exercising permissions granted by this License. -- -- "Source" form shall mean the preferred form for making modifications, -- including but not limited to software source code, documentation -- source, and configuration files. -- -- "Object" form shall mean any form resulting from mechanical -- transformation or translation of a Source form, including but -- not limited to compiled object code, generated documentation, -- and conversions to other media types. -- -- "Work" shall mean the work of authorship, whether in Source or -- Object form, made available under the License, as indicated by a -- copyright notice that is included in or attached to the work -- (an example is provided in the Appendix below). -- -- "Derivative Works" shall mean any work, whether in Source or Object -- form, that is based on (or derived from) the Work and for which the -- editorial revisions, annotations, elaborations, or other modifications -- represent, as a whole, an original work of authorship. For the purposes -- of this License, Derivative Works shall not include works that remain -- separable from, or merely link (or bind by name) to the interfaces of, -- the Work and Derivative Works thereof. -- -- "Contribution" shall mean any work of authorship, including -- the original version of the Work and any modifications or additions -- to that Work or Derivative Works thereof, that is intentionally -- submitted to Licensor for inclusion in the Work by the copyright owner -- or by an individual or Legal Entity authorized to submit on behalf of -- the copyright owner. 
For the purposes of this definition, "submitted" -- means any form of electronic, verbal, or written communication sent -- to the Licensor or its representatives, including but not limited to -- communication on electronic mailing lists, source code control systems, -- and issue tracking systems that are managed by, or on behalf of, the -- Licensor for the purpose of discussing and improving the Work, but -- excluding communication that is conspicuously marked or otherwise -- designated in writing by the copyright owner as "Not a Contribution." -- -- "Contributor" shall mean Licensor and any individual or Legal Entity -- on behalf of whom a Contribution has been received by Licensor and -- subsequently incorporated within the Work. -- -- 2. Grant of Copyright License. Subject to the terms and conditions of -- this License, each Contributor hereby grants to You a perpetual, -- worldwide, non-exclusive, no-charge, royalty-free, irrevocable -- copyright license to reproduce, prepare Derivative Works of, -- publicly display, publicly perform, sublicense, and distribute the -- Work and such Derivative Works in Source or Object form. -- -- 3. Grant of Patent License. Subject to the terms and conditions of -- this License, each Contributor hereby grants to You a perpetual, -- worldwide, non-exclusive, no-charge, royalty-free, irrevocable -- (except as stated in this section) patent license to make, have made, -- use, offer to sell, sell, import, and otherwise transfer the Work, -- where such license applies only to those patent claims licensable -- by such Contributor that are necessarily infringed by their -- Contribution(s) alone or by combination of their Contribution(s) -- with the Work to which such Contribution(s) was submitted. If You -- institute patent litigation against any entity (including a -- cross-claim or counterclaim in a lawsuit) alleging that the Work -- or a Contribution incorporated within the Work constitutes direct -- or contributory patent infringement, then any patent licenses -- granted to You under this License for that Work shall terminate -- as of the date such litigation is filed. -- -- 4. Redistribution. You may reproduce and distribute copies of the -- Work or Derivative Works thereof in any medium, with or without -- modifications, and in Source or Object form, provided that You -- meet the following conditions: -- -- (a) You must give any other recipients of the Work or -- Derivative Works a copy of this License; and -- -- (b) You must cause any modified files to carry prominent notices -- stating that You changed the files; and -- -- (c) You must retain, in the Source form of any Derivative Works -- that You distribute, all copyright, patent, trademark, and -- attribution notices from the Source form of the Work, -- excluding those notices that do not pertain to any part of -- the Derivative Works; and -- -- (d) If the Work includes a "NOTICE" text file as part of its -- distribution, then any Derivative Works that You distribute must -- include a readable copy of the attribution notices contained -- within such NOTICE file, excluding those notices that do not -- pertain to any part of the Derivative Works, in at least one -- of the following places: within a NOTICE text file distributed -- as part of the Derivative Works; within the Source form or -- documentation, if provided along with the Derivative Works; or, -- within a display generated by the Derivative Works, if and -- wherever such third-party notices normally appear. 
The contents -- of the NOTICE file are for informational purposes only and -- do not modify the License. You may add Your own attribution -- notices within Derivative Works that You distribute, alongside -- or as an addendum to the NOTICE text from the Work, provided -- that such additional attribution notices cannot be construed -- as modifying the License. -- -- You may add Your own copyright statement to Your modifications and -- may provide additional or different license terms and conditions -- for use, reproduction, or distribution of Your modifications, or -- for any such Derivative Works as a whole, provided Your use, -- reproduction, and distribution of the Work otherwise complies with -- the conditions stated in this License. -- -- 5. Submission of Contributions. Unless You explicitly state otherwise, -- any Contribution intentionally submitted for inclusion in the Work -- by You to the Licensor shall be under the terms and conditions of -- this License, without any additional terms or conditions. -- Notwithstanding the above, nothing herein shall supersede or modify -- the terms of any separate license agreement you may have executed -- with Licensor regarding such Contributions. -- -- 6. Trademarks. This License does not grant permission to use the trade -- names, trademarks, service marks, or product names of the Licensor, -- except as required for reasonable and customary use in describing the -- origin of the Work and reproducing the content of the NOTICE file. -- -- 7. Disclaimer of Warranty. Unless required by applicable law or -- agreed to in writing, Licensor provides the Work (and each -- Contributor provides its Contributions) on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -- implied, including, without limitation, any warranties or conditions -- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A -- PARTICULAR PURPOSE. You are solely responsible for determining the -- appropriateness of using or redistributing the Work and assume any -- risks associated with Your exercise of permissions under this License. -- -- 8. Limitation of Liability. In no event and under no legal theory, -- whether in tort (including negligence), contract, or otherwise, -- unless required by applicable law (such as deliberate and grossly -- negligent acts) or agreed to in writing, shall any Contributor be -- liable to You for damages, including any direct, indirect, special, -- incidental, or consequential damages of any character arising as a -- result of this License or out of the use or inability to use the -- Work (including but not limited to damages for loss of goodwill, -- work stoppage, computer failure or malfunction, or any and all -- other commercial damages or losses), even if such Contributor -- has been advised of the possibility of such damages. -- -- 9. Accepting Warranty or Additional Liability. While redistributing -- the Work or Derivative Works thereof, You may choose to offer, -- and charge a fee for, acceptance of support, warranty, indemnity, -- or other liability obligations and/or rights consistent with this -- License. However, in accepting such obligations, You may act only -- on Your own behalf and on Your sole responsibility, not on behalf -- of any other Contributor, and only if You agree to indemnify, -- defend, and hold each Contributor harmless for any liability -- incurred by, or claims asserted against, such Contributor by reason -- of your accepting any such warranty or additional liability. 
-- -- END OF TERMS AND CONDITIONS -- -- APPENDIX: How to apply the Apache License to your work. -- -- To apply the Apache License to your work, attach the following -- boilerplate notice, with the fields enclosed by brackets "[]" -- replaced with your own identifying information. (Don't include -- the brackets!) The text should be enclosed in the appropriate -- comment syntax for the file format. We also recommend that a -- file or class name and description of purpose be included on the -- same "printed page" as the copyright notice for easier -- identification within third-party archives. -- -- Copyright [yyyy] [name of copyright owner] -- -- Licensed under the Apache License, Version 2.0 (the "License"); -- you may not use this file except in compliance with the License. -- You may obtain a copy of the License at -- -- http://www.apache.org/licenses/LICENSE-2.0 -- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and -- limitations under the License. -Binary files ascend-llm/assets/webui.png and ascend-llm-qwen/assets/webui.png differ -diff -uNr ascend-llm/custom_op/matmul_integer_plugin.cc ascend-llm-qwen/custom_op/matmul_integer_plugin.cc ---- ascend-llm/custom_op/matmul_integer_plugin.cc 2024-09-04 19:21:03.043202000 +0800 -+++ ascend-llm-qwen/custom_op/matmul_integer_plugin.cc 1970-01-01 08:00:00.000000000 +0800 -@@ -1,31 +0,0 @@ --/* Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -- * -- * This program is free software; you can redistribute it and/or modify -- * it under the terms of the Apache License Version 2.0. -- * You may not use this file except in compliance with the License. -- * -- * This program is distributed in the hope that it will be useful, -- * but WITHOUT ANY WARRANTY; without even the implied warranty of -- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -- * Apache License for more details at -- * http://www.apache.org/licenses/LICENSE-2.0 -- */ -- --#include "register/register.h" -- --namespace domi { --Status ParseParamsMatmulInteger(const ge::Operator& op_src, ge::Operator& op_dest) { -- return SUCCESS; --} -- --REGISTER_CUSTOM_OP("BatchMatMulV2") -- .FrameworkType(ONNX) -- .OriginOpType({ge::AscendString("ai.onnx::14::MatMulInteger"), -- ge::AscendString("ai.onnx::15::MatMulInteger"), -- ge::AscendString("ai.onnx::10::MatMulInteger"), -- ge::AscendString("ai.onnx::11::MatMulInteger"), -- ge::AscendString("ai.onnx::12::MatMulInteger"), -- ge::AscendString("ai.onnx::13::MatMulInteger")}) -- .ParseParamsByOperatorFn(ParseParamsMatmulInteger) -- .ImplyType(ImplyType::TVM); --} // namespace domi -Binary files ascend-llm/export_llama/act_scales/llama-2-7b.pt and ascend-llm-qwen/export_llama/act_scales/llama-2-7b.pt differ -Binary files ascend-llm/export_llama/act_scales/tiny-llama.pt and ascend-llm-qwen/export_llama/act_scales/tiny-llama.pt differ -diff -uNr ascend-llm/export_llama/change_node.py ascend-llm-qwen/export_llama/change_node.py ---- ascend-llm/export_llama/change_node.py 2024-09-04 19:21:03.074127700 +0800 -+++ ascend-llm-qwen/export_llama/change_node.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,55 +0,0 @@ --import argparse --import onnx --import onnx.helper as helper --from onnx import TensorProto --def change_node(in_path,out_path): -- model = onnx.load(in_path) -- new_nodes = [] -- -- for node in model.graph.node: -- # 判断节点类型 -- new_node = node -- if node.op_type == "Cast": -- # 替换为新的算子类型, 昇腾Cast fp16 -> int8 有精度问题,暂时用AscendQuant -- to_attribute = next(attr for attr in node.attribute if attr.name == "to") -- if to_attribute.i == TensorProto.INT8: -- new_node = helper.make_node( -- "AscendQuant", -- inputs=node.input, -- outputs=node.output, -- offset=0., -- scale=1., -- ) -- new_nodes.append(new_node) -- -- new_graph = helper.make_graph( -- new_nodes, -- "new_graph", -- inputs=model.graph.input, -- outputs=model.graph.output, -- value_info=model.graph.value_info, -- initializer=model.graph.initializer -- ) -- -- new_model = helper.make_model(new_graph, producer_name=model.producer_name,opset_imports=model.opset_import,ir_version = model.ir_version) -- # new_model.ir_version = model.ir_version -- # new_model.opset_import = model.opset_import -- # new_model.metadata_props = model.metadata_props -- onnx.save(new_model, out_path,save_as_external_data=True,size_threshold=0,convert_attribute=True) -- --if __name__ == "__main__": -- parser = argparse.ArgumentParser() -- parser.add_argument( -- "--input", -- type=str, -- default="./model/export_out/tiny-llama.onnx", -- help="path to onnx model that need to be processed" -- ) -- parser.add_argument( -- "--output", -- type=str, -- default="./model/change_node_out/tiny-llama.onnx", -- help="where to save new onnx model", -- ) -- args = parser.parse_args() -- change_node(args.input,args.output) -\ No newline at end of file -diff -uNr ascend-llm/export_llama/config/no.py ascend-llm-qwen/export_llama/config/no.py ---- ascend-llm/export_llama/config/no.py 2024-09-04 19:21:03.074127700 +0800 -+++ ascend-llm-qwen/export_llama/config/no.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,3 +0,0 @@ --# 不进行量化 --def get(model_cfg,act_max): -- return {} -\ No newline at end of file -diff -uNr ascend-llm/export_llama/config/sd.py ascend-llm-qwen/export_llama/config/sd.py ---- ascend-llm/export_llama/config/sd.py 2024-09-04 19:21:03.074127700 +0800 -+++ ascend-llm-qwen/export_llama/config/sd.py 
1970-01-01 08:00:00.000000000 +0800 -@@ -1,21 +0,0 @@ --# 静态混合精度分解 --def get(model_cfg,act_max): -- quant_cfg = {} -- h_mx,d_mx = findN(0.04 * model_cfg.hidden_size),findN(0.1 * model_cfg.intermediate_size) -- scale,step = 4, 4/model_cfg.num_hidden_layers -- for i in range(model_cfg.num_hidden_layers): -- scale = max(0,scale-step) -- h_cur,d_cur = max(16,h_mx >> int(scale)), max(32,d_mx >> int(scale)) -- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]: -- quant_cfg[str(i)+"."+name] = {"type":"W8SD","act_scale":True,"alpha":h_cur} -- quant_cfg[str(i)+".down_proj"] = {"type":"W8SD","act_scale":True,"alpha":d_cur} -- quant_cfg["lm_head"] = {"type":"W8SD"} -- quant_cfg["act_scales_path"] = act_max -- return quant_cfg -- --def findN(N): -- sum = 1; -- while True: -- if sum * 2 > N: -- return sum -- sum = sum * 2 -\ No newline at end of file -diff -uNr ascend-llm/export_llama/config/smooth.py ascend-llm-qwen/export_llama/config/smooth.py ---- ascend-llm/export_llama/config/smooth.py 2024-09-04 19:21:03.075092900 +0800 -+++ ascend-llm-qwen/export_llama/config/smooth.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,13 +0,0 @@ --# 平滑激活 --def get(model_cfg,act_max): -- quant_cfg = {} -- for i in range(model_cfg.num_hidden_layers): -- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]: -- quant_cfg[str(i)+"."+name] = {"type":"W8X8"} -- # 对某一个具体的层加act_scale的作用: 若为W8X8,则对该层进行smooth;如为W8SD,则用act_scale进行混合精度分解。 -- quant_cfg[str(i)+".down_proj"] = {"type":"W8X8","act_scale":True,"alpha":0.85} -- quant_cfg["lm_head"] = {"type":"W8X8","act_scale":True,"alpha":0.85} -- quant_cfg["act_scales_path"] = act_max -- quant_cfg["alpha"] = 0.85 # smoothquant 迁移系数 -- quant_cfg["smooth"] = True # 整体的smooth控制是将激活值的缩放与RMSNorm融合,不会造成额外的开销,但down_proj层无法使用 -- return quant_cfg -\ No newline at end of file -diff -uNr ascend-llm/export_llama/config/smsd.py ascend-llm-qwen/export_llama/config/smsd.py ---- ascend-llm/export_llama/config/smsd.py 2024-09-04 19:21:03.075092900 +0800 -+++ ascend-llm-qwen/export_llama/config/smsd.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,22 +0,0 @@ --# 对down_proj混合精度分解,对其他部分平滑激活 --def get(model_cfg,act_max): -- quant_cfg = {} -- d_mx = findN(0.1 * model_cfg.intermediate_size) -- scale,step = 4, 4/model_cfg.num_hidden_layers -- for i in range(model_cfg.num_hidden_layers): -- scale = max(0,scale-step) -- d_cur = max(32,d_mx >> int(scale)) -- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]: -- quant_cfg[str(i)+"."+name] = {"type":"W8X8"} -- quant_cfg[str(i)+".down_proj"] = {"type":"W8SD","act_scale":True,"alpha":d_cur} -- quant_cfg["lm_head"] = {"type":"W8SD","act_scale":True,"alpha":64} -- quant_cfg["act_scales_path"] = act_max -- quant_cfg["smooth"] = True -- return quant_cfg -- --def findN(N): -- sum = 1; -- while True: -- if sum * 2 > N: -- return sum -- sum = sum * 2 -\ No newline at end of file -diff -uNr ascend-llm/export_llama/config/w8.py ascend-llm-qwen/export_llama/config/w8.py ---- ascend-llm/export_llama/config/w8.py 2024-09-04 19:21:03.075092900 +0800 -+++ ascend-llm-qwen/export_llama/config/w8.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,8 +0,0 @@ --# 仅权重int8量化 --def get(model_cfg,act_max): -- quant_cfg = {} -- for i in range(model_cfg.num_hidden_layers): -- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]: -- quant_cfg[str(i)+"."+name] = {"type":"W8"} -- quant_cfg["lm_head"] = {"type":"W8"} -- return quant_cfg -\ No newline at end of file -diff -uNr 
ascend-llm/export_llama/config/w8dx.py ascend-llm-qwen/export_llama/config/w8dx.py ---- ascend-llm/export_llama/config/w8dx.py 2024-09-04 19:21:03.075092900 +0800 -+++ ascend-llm-qwen/export_llama/config/w8dx.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,10 +0,0 @@ --# 动态混合精度分解 --def get(model_cfg,act_max): -- quant_cfg = {} -- for i in range(model_cfg.num_hidden_layers): -- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]: -- quant_cfg[str(i)+"."+name] = {"type":"W8DX"} -- # quant_cfg["lm_head"] = {"type":"W8DX"} # 可以根据需要取消注释 -- # quant_cfg["act_scales_path"] = act_max # 可以根据需要取消注释 -- # quant_cfg["smooth"] = True # 可以根据需要取消注释 -- return quant_cfg -\ No newline at end of file -diff -uNr ascend-llm/export_llama/config/w8x8.py ascend-llm-qwen/export_llama/config/w8x8.py ---- ascend-llm/export_llama/config/w8x8.py 2024-09-04 19:21:03.075092900 +0800 -+++ ascend-llm-qwen/export_llama/config/w8x8.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,8 +0,0 @@ --# per-token absmax量化 --def get(model_cfg,act_max): -- quant_cfg = {} -- for i in range(model_cfg.num_hidden_layers): -- for name in ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]: -- quant_cfg[str(i)+"."+name] = {"type":"W8X8"} -- quant_cfg["lm_head"] = {"type":"W8X8"} -- return quant_cfg -\ No newline at end of file -diff -uNr ascend-llm/export_llama/eval.py ascend-llm-qwen/export_llama/eval.py ---- ascend-llm/export_llama/eval.py 2024-09-04 19:21:03.076083700 +0800 -+++ ascend-llm-qwen/export_llama/eval.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,161 +0,0 @@ --import argparse --import importlib --import json --import lm_eval --from lm_eval.models.huggingface import HFLM --from lm_eval.utils import make_table --import torch --import tqdm --from datasets import load_dataset --from transformers import AutoTokenizer,AutoModelForCausalLM --import datetime -- --print_ = print -- --def lm_eval_fn(args): -- global print_ -- lm_obj = HFLM(pretrained=args.model,tokenizer=args.tokenizer, batch_size="auto") -- task_manager = lm_eval.tasks.TaskManager() -- -- results = lm_eval.simple_evaluate( # call simple_evaluate -- model=lm_obj, -- tasks=args.tasks, -- num_fewshot=0, -- task_manager=task_manager, -- ) -- # now = datetime.datetime.now() -- # with open(f'eval-{now.month:02}-{now.day:02}-{now.hour:02}:{now.minute:02}:{now.second:02}.json', 'w') as f: -- # json.dump(results, f) -- print_(make_table(results)) -- --def ppl_eval_fn(args): -- global print_ -- def evaluate_perplexity(model, tokenizer,dataset): -- def _perplexity(nlls, n_samples, seqlen): -- return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen)) -- data = None -- if dataset == "wikitext": -- data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") -- data = tokenizer("\n\n".join(data["text"]), return_tensors="pt") -- elif dataset == "c4": -- data = load_dataset('allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation') -- data = tokenizer(" ".join(data[:]['text']), return_tensors="pt") -- else: -- raise f"Not support ppl eval dataset:{dataset}" -- data = data.input_ids.to(model.device) -- seqlen = 2048 -- model = model.eval() -- n_samples = data.numel() // seqlen -- -- nlls = [] -- -- with tqdm.tqdm(range(n_samples), desc="Perplexity -") as progress_bar: -- for i in progress_bar: -- start_index = i * seqlen -- end_index = (i + 1) * seqlen -- batch = data[:, start_index:end_index].to(model.device) -- with torch.no_grad(): -- logits = model(batch).logits -- shift_logits = 
logits[:, :-1, :].contiguous().float() -- shift_labels = data[:, start_index:end_index][:, 1:] -- loss_fct = torch.nn.CrossEntropyLoss() -- loss = loss_fct( -- shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) -- ) -- neg_log_likelihood = loss.float() * seqlen -- nlls.append(neg_log_likelihood) -- -- curr_ppl = _perplexity(nlls, i + 1, seqlen) -- progress_bar.set_description(f"Perplexity {curr_ppl:.3f}") -- -- ppl = _perplexity(nlls, n_samples, seqlen) -- print_(f"Perplexity on {dataset}: {ppl.item()}") -- return ppl.item() -- for dataset in args.datasets: -- print_(f"\n-----------------begin test ppl on dataset {dataset}-------------------\n") -- evaluate_perplexity(args.model,args.tokenizer,dataset) -- --def run_test(args,title:str): -- global print_ -- print_(f"\n-------------------------{title}-----------------------------\n") -- if "ppl" in args.tasks: -- ppl_eval_fn(args) -- args.tasks.remove("ppl") -- if len(args.tasks) != 0: -- lm_eval_fn(args) -- --def parse_args(): -- now = datetime.datetime.now() -- parser = argparse.ArgumentParser() -- parser.add_argument( -- "--model","-m", -- type=str, default="./model/TinyLlama-1.1B-Chat-v1.0", -- help="path to model or hugging face model id" -- ) -- parser.add_argument( -- "--output","-o", -- type=str, -- default=f"./result-{now.month:02}-{now.day:02}-{now.hour:02}:{now.minute:02}:{now.second:02}.log", -- help="where to save eval result", -- ) -- parser.add_argument( -- "--datasets","-d", -- type=str, -- default="wikitext,c4", -- help=" the dataset used to eval perplexity", -- ) -- parser.add_argument( -- "--tasks","-t", -- type=str, -- default="mmlu,ppl,lambada_openai,boolq,arc_easy,arc_challenge,piqa,winogrande", -- help="tasks parameter for lm-evaluation-harness", -- ) -- parser.add_argument( -- "--act-path","-a", -- type=str, -- default="./act_scales/llama-2-7b.pt", -- help="path to act_scales", -- ) -- parser.add_argument( -- "--quant","-q", -- type=str, -- default="./config/w8x8.py", -- help="path to quant config", -- ) -- return parser.parse_args() -- -- --def main(): -- import os -- os.chdir(os.path.dirname(__file__)) -- args = parse_args() -- args.datasets = args.datasets.split(",") -- model_name = args.model.split("/")[-1] -- setattr(args,"tokenizer",AutoTokenizer.from_pretrained(args.model)) -- setattr(args,"model",AutoModelForCausalLM.\ -- from_pretrained(args.model,torch_dtype=torch.float16,device_map="auto")) -- args.model.eval() -- out_f = open(args.output,"w") -- def print_fn(*value:object,sep=" ",end="\n",file=None,flush=False): -- out_f.write(sep.join([str(v) for v in value])+end) -- print(*value,sep=sep,end=end,file=file,flush=flush) -- global print_ -- print_ = print_fn -- args.tasks = args.tasks.split(",") -- flag = "ppl" not in args.tasks -- run_test(args,f"test {model_name}") -- args.tasks = args.tasks if flag else (args.tasks + ["ppl"]) -- # quantize -- model_cfg=args.model.model.config -- spec = importlib.util.spec_from_file_location("quant_cfg_module", args.quant) -- quant_cfg_module = importlib.util.module_from_spec(spec) -- spec.loader.exec_module(quant_cfg_module) -- quantize_cfg = quant_cfg_module.get(model_cfg,args.act_path) -- from quantize import quantize -- quantize(args.model,quantize_cfg) -- -- run_test(args,f"test quantized {model_name}") -- out_f.close() -- --if __name__ == "__main__": -- main() -\ No newline at end of file diff -uNr ascend-llm/export_llama/export_llama.py ascend-llm-qwen/export_llama/export_llama.py ---- ascend-llm/export_llama/export_llama.py 2024-09-04 
19:49:56.471989100 +0800 -+++ ascend-llm-qwen/export_llama/export_llama.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,97 +0,0 @@ --import argparse --import importlib --import torch --import os +--- ascend-llm/export_llama/export_llama.py 2024-09-05 15:10:55.831311000 +0800 ++++ ascend-llm-qwen/export_llama/export_llama.py 2024-09-05 15:20:09.720307600 +0800 +@@ -2,16 +2,17 @@ + import importlib + import torch + import os -from transformers import LlamaForCausalLM, LlamaTokenizer - -- --def export_onnx(base_model,out_path,quant_cfg_path,act_path): ++from transformers import Qwen2ForCausalLM, Qwen2Tokenizer ++import torch_npu ++from torch_npu.contrib import transfer_to_npu ++torch_npu.npu.set_device("npu:3") + + def export_onnx(base_model,out_path,quant_cfg_path,act_path): - tokenizer= LlamaTokenizer.from_pretrained(base_model) - model = LlamaForCausalLM.from_pretrained( -- base_model, -- torch_dtype=torch.float16, ++ tokenizer= Qwen2Tokenizer.from_pretrained(base_model) ++ model = Qwen2ForCausalLM.from_pretrained( + base_model, + torch_dtype=torch.float16, - device_map="auto", - ) -- model_cfg=model.model.config -- spec = importlib.util.spec_from_file_location("quant_cfg_module", quant_cfg_path) -- quant_cfg_module = importlib.util.module_from_spec(spec) -- spec.loader.exec_module(quant_cfg_module) -- quantize_cfg = quant_cfg_module.get(model_cfg,act_path) -- from quantize import quantize -- quantize(model,quantize_cfg) -- -- input_names = ["input_ids", "attention_mask", "position_ids","past_key_values"] -- output_names = ["logits","out_key_values","attn_scores"] -- dynamic_axes = { -- "input_ids": { 0: "batch_size", 1: "seq_length" }, -- "attention_mask": { 0: "batch_size",1:"all_len" }, -- "position_ids": { 0: "batch_size", 1: "seq_length" }, -- "past_key_values": { 2: "batch_size", 4: "kv_len" }, -- } -- -- batch_size,seq_len,kv_len=1,16,1024 -- all_len = seq_len + kv_len -- n_layers,n_heads,hidden_size=model_cfg.num_hidden_layers,model_cfg.num_key_value_heads,model_cfg.hidden_size -- head_dim = int(model_cfg.hidden_size / model_cfg.num_attention_heads) -- -- -- input_ids = torch.zeros((batch_size,seq_len)).long().to("cuda") # batch_size, new_sequence_length -- attention_mask = torch.zeros((batch_size,all_len)).long().to("cuda") # batch_size, all_sequence_length -- position_ids = torch.zeros((batch_size,seq_len)).long().to("cuda") # batch_size, new_sequence_length -- # past_keys = torch.rand((batch_size, n_heads,kv_len, head_dim),dtype=torch.float16).to("cuda") -- # past_values = torch.rand((batch_size,n_heads, kv_len, head_dim),dtype=torch.float16).to("cuda") -- # past_key_values = tuple([(past_keys,past_values)] * n_layers) -- past_key_values = torch.rand((n_layers,2,batch_size,n_heads, kv_len, head_dim),dtype=torch.float16).to("cuda") -- input_args = ( -- input_ids, -- attention_mask, -- position_ids, -- past_key_values, -- None, # inputs_embeds: Optional[torch.FloatTensor] = None, -- None, #labels: Optional[torch.LongTensor] = None, -- True, #use_cache: Optional[bool] = None, -- True # output_attentions: Optional[bool] = None, -- ) -- -- model.eval() -- torch.onnx.export( -- model, -- f=out_path, -- args=input_args, -- input_names=input_names, -- output_names=output_names, -- dynamic_axes=dynamic_axes, -- opset_version=13, -- export_params=True, -- ) -- --if __name__ == "__main__": -- import os -- os.chdir(os.path.dirname(__file__)) -- parser = argparse.ArgumentParser() -- parser.add_argument( -- "--model", "-m", -- type=str, -- default="./model/TinyLlama-1.1B-Chat-v1.0", -- 
help="transformers model" -- ) -- parser.add_argument( -- "--output","-o", -- type=str, -- default="./model/export_out/tiny-llama.onnx", -- help="where to save onnx model", -- ) -- parser.add_argument( -- "--act-path","-a", -- type=str, -- default="./act_scales/llama-2-7b.pt", -- help="path to act_scales", -- ) -- parser.add_argument( -- "--quant","-q", -- type=str, -- default="./config/w8x8.py", -- help="path to quant config", -- ) -- args = parser.parse_args() -- export_onnx(args.model,args.output,args.quant,args.act_path) -diff -uNr ascend-llm/export_llama/generate_act_scales.py ascend-llm-qwen/export_llama/generate_act_scales.py ---- ascend-llm/export_llama/generate_act_scales.py 2024-09-04 19:21:03.076083700 +0800 -+++ ascend-llm-qwen/export_llama/generate_act_scales.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,109 +0,0 @@ --''' --code from https://github.com/mit-han-lab/smoothquant/ --''' --from datasets import load_dataset --import functools --from collections import defaultdict -- --from functools import partial --import numpy as np --from tqdm import tqdm --import torch --import os -- --from transformers import ( -- AutoModelForCausalLM, -- AutoTokenizer, --) --import argparse -- --def get_act_scales(model, tokenizer, dataset_path, num_samples=512, seq_len=512): -- model.eval() -- device = next(model.parameters()).device -- act_scales = {} -- -- def stat_tensor(name, tensor): -- hidden_dim = tensor.shape[-1] -- tensor = tensor.view(-1, hidden_dim).abs().detach() -- comming_max = torch.max(tensor, dim=0)[0].float().cpu() -- if name in act_scales: -- act_scales[name] = torch.max(act_scales[name], comming_max) -- else: -- act_scales[name] = comming_max -- -- def stat_input_hook(m, x, y, name): -- if isinstance(x, tuple): -- x = x[0] -- stat_tensor(name, x) -- -- hooks = [] -- for name, m in model.named_modules(): -- if isinstance(m,torch.nn.Linear): -- hooks.append( -- m.register_forward_hook(functools.partial(stat_input_hook, name=name)) -- ) -- -- dataset = load_dataset("json", data_files=dataset_path,split="train") -- dataset = dataset.shuffle(seed=42) -- -- for i in tqdm(range(num_samples)): -- text = "Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format( -- instruction=dataset["instruction"][i], input=dataset["output"][i] -- ) -- input_ids = tokenizer( -- text, return_tensors="pt", max_length=seq_len, truncation=True -- ).input_ids.to(device) -- model(input_ids) -- -- for h in hooks: -- h.remove() -- -- return act_scales -- -- -- --def build_model_and_tokenizer(model_name): -- tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512) -- kwargs = {"torch_dtype": torch.float16, "device_map": "sequential"} -- model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) -- return model, tokenizer -- -- --def parse_args(): -- parser = argparse.ArgumentParser() -- parser.add_argument( -- "--model-name", type=str, default="/run/llama-chat-7b-hf", help="model name" -- ) -- parser.add_argument( -- "--output-path", -- type=str, -- default="act_scales/opt-1.3b.pt", -- help="where to save the act scales", -- ) -- parser.add_argument( -- "--dataset-path", -- type=str, -- default="/root/zanilia/alpaca-lora/alpaca_data.json", -- help="location of the calibration dataset, we use the validation set of the Pile dataset", -- ) -- parser.add_argument("--num-samples", type=int, default=512) -- parser.add_argument("--seq-len", type=int, default=512) -- args = parser.parse_args() -- return args -- -- --@torch.no_grad() --def main(): -- args = parse_args() -- model, tokenizer = build_model_and_tokenizer(args.model_name) -- -- act_scales = get_act_scales( -- model, tokenizer, args.dataset_path, args.num_samples, args.seq_len -- ) -- -- os.makedirs(os.path.dirname(args.output_path), exist_ok=True) -- torch.save(act_scales, args.output_path) -- -- --if __name__ == "__main__": -- main() -diff -uNr ascend-llm/export_llama/modeling_llama_4.35.py ascend-llm-qwen/export_llama/modeling_llama_4.35.py ---- ascend-llm/export_llama/modeling_llama_4.35.py 2024-09-04 19:21:03.078081000 +0800 -+++ ascend-llm-qwen/export_llama/modeling_llama_4.35.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,1264 +0,0 @@ --# coding=utf-8 --# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. --# --# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX --# and OPT implementations in this library. It has been modified from its --# original forms to accommodate minor architectural differences compared --# to GPT-NeoX and OPT used by the Meta AI team that trained the model. --# --# Licensed under the Apache License, Version 2.0 (the "License"); --# you may not use this file except in compliance with the License. --# You may obtain a copy of the License at --# --# http://www.apache.org/licenses/LICENSE-2.0 --# --# Unless required by applicable law or agreed to in writing, software --# distributed under the License is distributed on an "AS IS" BASIS, --# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --# See the License for the specific language governing permissions and --# limitations under the License. 
--""" PyTorch LLaMA model.""" --import math --import warnings --from typing import List, Optional, Tuple, Union -- --import torch --import torch.nn.functional as F --import torch.utils.checkpoint --from torch import nn --from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -- --from ...activations import ACT2FN --from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask --from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast --from ...modeling_utils import PreTrainedModel --from ...pytorch_utils import ALL_LAYERNORM_LAYERS --from ...utils import ( -- add_start_docstrings, -- add_start_docstrings_to_model_forward, -- is_flash_attn_2_available, -- logging, -- replace_return_docstrings, --) --from ...utils.import_utils import is_torch_fx_available --from .configuration_llama import LlamaConfig -- -- --if is_flash_attn_2_available(): -- from flash_attn import flash_attn_func, flash_attn_varlen_func -- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa -- -- --# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. --# It means that the function will not be traced through and simply appear as a node in the graph. --if is_torch_fx_available(): -- _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) -- -- --logger = logging.get_logger(__name__) -- --_CONFIG_FOR_DOC = "LlamaConfig" -- -- --def _get_unpad_data(attention_mask): -- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) -- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() -- max_seqlen_in_batch = seqlens_in_batch.max().item() -- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) -- return ( -- indices, -- cu_seqlens, -- max_seqlen_in_batch, -- ) -- -- --def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): -- warnings.warn( -- "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask" -- ) -- return AttentionMaskConverter._prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) -- -- --def _make_causal_mask( -- input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 --): -- warnings.warn( -- "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in v4.37. 
Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask" -- ) -- return AttentionMaskConverter._make_causal_mask( -- input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length -- ) -- -- --class LlamaRMSNorm(nn.Module): -- def __init__(self, hidden_size, eps=1e-6): -- """ -- LlamaRMSNorm is equivalent to T5LayerNorm -- """ -- super().__init__() -- self.weight = nn.Parameter(torch.ones(hidden_size)) -- self.variance_epsilon = eps -- -- def forward(self, hidden_states): -- input_dtype = hidden_states.dtype -- hidden_states = hidden_states.to(torch.float32) -- variance = hidden_states.pow(2).mean(-1, keepdim=True) -- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) -- return self.weight * hidden_states.to(input_dtype) -- -- --ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) -- -- --class LlamaRotaryEmbedding(nn.Module): -- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): -- super().__init__() -- -- self.dim = dim -- self.max_position_embeddings = max_position_embeddings -- self.base = base -- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) -- self.register_buffer("inv_freq", inv_freq, persistent=False) -- -- # Build here to make `torch.jit.trace` work. -- self._set_cos_sin_cache( -- seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() -- ) -- -- def _set_cos_sin_cache(self, seq_len, device, dtype): -- self.max_seq_len_cached = seq_len -- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) -- -- freqs = torch.einsum("i,j->ij", t, self.inv_freq) -- # Different from paper, but it uses a different permutation in order to obtain the same calculation -- emb = torch.cat((freqs, freqs), dim=-1) -- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) -- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) -- -- def forward(self, x, seq_len=None): -- return ( -- self.cos_cached.to(dtype=x.dtype), -- self.sin_cached.to(dtype=x.dtype), -- ) -- # x: [bs, num_attention_heads, seq_len, head_size] -- if seq_len > self.max_seq_len_cached: -- self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) -- -- return ( -- self.cos_cached[:seq_len].to(dtype=x.dtype), -- self.sin_cached[:seq_len].to(dtype=x.dtype), -- ) -- -- --class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): -- """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" -- -- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): -- self.scaling_factor = scaling_factor -- super().__init__(dim, max_position_embeddings, base, device) -- -- def _set_cos_sin_cache(self, seq_len, device, dtype): -- self.max_seq_len_cached = seq_len -- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) -- t = t / self.scaling_factor -- -- freqs = torch.einsum("i,j->ij", t, self.inv_freq) -- # Different from paper, but it uses a different permutation in order to obtain the same calculation -- emb = torch.cat((freqs, freqs), dim=-1) -- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) -- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) -- -- --class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): -- """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" -- -- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): -- self.scaling_factor = scaling_factor -- super().__init__(dim, max_position_embeddings, base, device) -- -- def _set_cos_sin_cache(self, seq_len, device, dtype): -- self.max_seq_len_cached = seq_len -- -- if seq_len > self.max_position_embeddings: -- base = self.base * ( -- (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) -- ) ** (self.dim / (self.dim - 2)) -- inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) -- self.register_buffer("inv_freq", inv_freq, persistent=False) -- -- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) -- -- freqs = torch.einsum("i,j->ij", t, self.inv_freq) -- # Different from paper, but it uses a different permutation in order to obtain the same calculation -- emb = torch.cat((freqs, freqs), dim=-1) -- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) -- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) -- -- --def rotate_half(x): -- """Rotates half the hidden dims of the input.""" -- x1 = x[..., : x.shape[-1] // 2] -- x2 = x[..., x.shape[-1] // 2 :] -- return torch.cat((-x2, x1), dim=-1) -- -- --def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): -- """Applies Rotary Position Embedding to the query and key tensors. -- -- Args: -- q (`torch.Tensor`): The query tensor. -- k (`torch.Tensor`): The key tensor. -- cos (`torch.Tensor`): The cosine part of the rotary embedding. -- sin (`torch.Tensor`): The sine part of the rotary embedding. -- position_ids (`torch.Tensor`): -- The position indices of the tokens corresponding to the query and key tensors. For example, this can be -- used to pass offsetted position ids when working with a KV-cache. -- unsqueeze_dim (`int`, *optional*, defaults to 1): -- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and -- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note -- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and -- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes -- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have -- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. -- Returns: -- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
-- """ -- cos = cos[position_ids].unsqueeze(unsqueeze_dim) -- sin = sin[position_ids].unsqueeze(unsqueeze_dim) -- q_embed = (q * cos) + (rotate_half(q) * sin) -- k_embed = (k * cos) + (rotate_half(k) * sin) -- return q_embed, k_embed -- -- --class LlamaMLP(nn.Module): -- def __init__(self, config): -- super().__init__() -- self.config = config -- self.hidden_size = config.hidden_size -- self.intermediate_size = config.intermediate_size -- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) -- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) -- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) -- self.act_fn = ACT2FN[config.hidden_act] -- -- def forward(self, x): -- if self.config.pretraining_tp > 1: -- slice = self.intermediate_size // self.config.pretraining_tp -- gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) -- up_proj_slices = self.up_proj.weight.split(slice, dim=0) -- down_proj_slices = self.down_proj.weight.split(slice, dim=1) -- -- gate_proj = torch.cat( -- [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 -- ) -- up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) -- -- intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) -- down_proj = [ -- F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) -- ] -- down_proj = sum(down_proj) -- else: -- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -- -- return down_proj -- -- --def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: -- """ -- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, -- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) -- """ -- batch, num_key_value_heads, slen, head_dim = hidden_states.shape -- if n_rep == 1: -- return hidden_states -- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) -- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) -- -- --class LlamaAttention(nn.Module): -- """Multi-headed attention from 'Attention Is All You Need' paper""" -- -- def __init__(self, config: LlamaConfig): -- super().__init__() -- self.config = config -- self.hidden_size = config.hidden_size -- self.num_heads = config.num_attention_heads -- self.head_dim = self.hidden_size // self.num_heads -- self.num_key_value_heads = config.num_key_value_heads -- self.num_key_value_groups = self.num_heads // self.num_key_value_heads -- self.max_position_embeddings = config.max_position_embeddings -- self.rope_theta = config.rope_theta -- self.is_causal = True -- -- if (self.head_dim * self.num_heads) != self.hidden_size: -- raise ValueError( -- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" -- f" and `num_heads`: {self.num_heads})." 
-- ) -- self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) -- self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) -- self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) -- self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) -- self._init_rope() -- -- def _init_rope(self): -- if self.config.rope_scaling is None: -- self.rotary_emb = LlamaRotaryEmbedding( -- self.head_dim, -- max_position_embeddings=self.max_position_embeddings, -- base=self.rope_theta, -- ) -- else: -- scaling_type = self.config.rope_scaling["type"] -- scaling_factor = self.config.rope_scaling["factor"] -- if scaling_type == "linear": -- self.rotary_emb = LlamaLinearScalingRotaryEmbedding( -- self.head_dim, -- max_position_embeddings=self.max_position_embeddings, -- scaling_factor=scaling_factor, -- base=self.rope_theta, -- ) -- elif scaling_type == "dynamic": -- self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( -- self.head_dim, -- max_position_embeddings=self.max_position_embeddings, -- scaling_factor=scaling_factor, -- base=self.rope_theta, -- ) -- else: -- raise ValueError(f"Unknown RoPE scaling type {scaling_type}") -- -- def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): -- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() -- -- def forward( -- self, -- hidden_states: torch.Tensor, -- attention_mask: Optional[torch.Tensor] = None, -- position_ids: Optional[torch.LongTensor] = None, -- past_key_value: Optional[Tuple[torch.Tensor]] = None, -- output_attentions: bool = False, -- use_cache: bool = False, -- **kwargs, -- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: -- if "padding_mask" in kwargs: -- warnings.warn( -- "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" -- ) -- -- bsz, q_len, _ = hidden_states.size() -- -- if self.config.pretraining_tp > 1: -- key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp -- query_slices = self.q_proj.weight.split( -- (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 -- ) -- key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) -- value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) -- -- query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] -- query_states = torch.cat(query_states, dim=-1) -- -- key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] -- key_states = torch.cat(key_states, dim=-1) -- -- value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] -- value_states = torch.cat(value_states, dim=-1) -- -- else: -- query_states = self.q_proj(hidden_states) -- key_states = self.k_proj(hidden_states) -- value_states = self.v_proj(hidden_states) -- -- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) -- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) -- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) -- -- kv_seq_len = key_states.shape[-2] -- if past_key_value is not None: -- kv_seq_len += past_key_value[0].shape[-2] -- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) -- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) -- -- out_key_value = (key_states, value_states) if use_cache else None -- -- if past_key_value is not None: -- # reuse k, v, self_attention -- key_states = torch.cat([past_key_value[0], key_states], dim=2) -- value_states = torch.cat([past_key_value[1], value_states], dim=2) -- -- -- key_states = repeat_kv(key_states, self.num_key_value_groups) -- value_states = repeat_kv(value_states, self.num_key_value_groups) -- -- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) -- -- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): -- raise ValueError( -- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" -- f" {attn_weights.size()}" -- ) -- -- if attention_mask is not None: -- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): -- raise ValueError( -- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" -- ) -- attn_weights = attn_weights + attention_mask -- -- # upcast attention to fp32 -- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) -- attn_output = torch.matmul(attn_weights, value_states) -- -- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): -- raise ValueError( -- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" -- f" {attn_output.size()}" -- ) -- -- attn_output = attn_output.transpose(1, 2).contiguous() -- -- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) -- -- if self.config.pretraining_tp > 1: -- attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) -- o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) -- attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i 
in range(self.config.pretraining_tp)]) -- else: -- attn_output = self.o_proj(attn_output) -- -- if not output_attentions: -- attn_weights = None -- -- return attn_output, attn_weights, out_key_value -- -- --class LlamaFlashAttention2(LlamaAttention): -- """ -- Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays -- untouched. The only required change would be on the forward pass where it needs to correctly call the public API of -- flash attention and deal with padding tokens in case the input contains any of them. -- """ -- -- def forward( -- self, -- hidden_states: torch.Tensor, -- attention_mask: Optional[torch.LongTensor] = None, -- position_ids: Optional[torch.LongTensor] = None, -- past_key_value: Optional[Tuple[torch.Tensor]] = None, -- output_attentions: bool = False, -- use_cache: bool = False, -- **kwargs, -- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: -- # LlamaFlashAttention2 attention does not support output_attentions -- if "padding_mask" in kwargs: -- warnings.warn( -- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" -- ) -- -- # overwrite attention_mask with padding_mask -- attention_mask = kwargs.pop("padding_mask") -- -- output_attentions = False -- -- bsz, q_len, _ = hidden_states.size() -- -- query_states = self.q_proj(hidden_states) -- key_states = self.k_proj(hidden_states) -- value_states = self.v_proj(hidden_states) -- -- # Flash attention requires the input to have the shape -- # batch_size x seq_length x head_dim x hidden_dim -- # therefore we just need to keep the original shape -- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) -- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) -- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) -- -- kv_seq_len = key_states.shape[-2] -- if past_key_value is not None: -- kv_seq_len += past_key_value[0].shape[-2] -- -- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) -- -- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) -- -- if past_key_value is not None: -- # reuse k, v, self_attention -- key_states = torch.cat([past_key_value[0], key_states], dim=2) -- value_states = torch.cat([past_key_value[1], value_states], dim=2) -- -- past_key_value = (key_states, value_states) if use_cache else None -- -- query_states = query_states.transpose(1, 2) -- key_states = key_states.transpose(1, 2) -- value_states = value_states.transpose(1, 2) -- -- # TODO: llama does not have dropout in the config?? -- # It is recommended to use dropout with FA according to the docs -- # when training. -- dropout_rate = 0.0 # if not self.training else self.attn_dropout -- -- # In PEFT, usually we cast the layer norms in float32 for training stability reasons -- # therefore the input hidden states gets silently casted in float32. Hence, we need -- # cast them back in the correct dtype just to be sure everything works as expected. -- # This might slowdown training & inference so it is recommended to not cast the LayerNorms -- # in fp32. 
(LlamaRMSNorm handles it correctly) -- -- input_dtype = query_states.dtype -- if input_dtype == torch.float32: -- # Handle the case where the model is quantized -- if hasattr(self.config, "_pre_quantization_dtype"): -- target_dtype = self.config._pre_quantization_dtype -- else: -- target_dtype = self.q_proj.weight.dtype -- -- logger.warning_once( -- f"The input hidden states seems to be silently casted in float32, this might be related to" -- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" -- f" {target_dtype}." -- ) -- -- query_states = query_states.to(target_dtype) -- key_states = key_states.to(target_dtype) -- value_states = value_states.to(target_dtype) -- -- attn_output = self._flash_attention_forward( -- query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate -- ) -- -- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() -- attn_output = self.o_proj(attn_output) -- -- if not output_attentions: -- attn_weights = None -- -- return attn_output, attn_weights, past_key_value -- -- def _flash_attention_forward( -- self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None -- ): -- """ -- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token -- first unpad the input, then computes the attention scores and pad the final attention scores. -- -- Args: -- query_states (`torch.Tensor`): -- Input query states to be passed to Flash Attention API -- key_states (`torch.Tensor`): -- Input key states to be passed to Flash Attention API -- value_states (`torch.Tensor`): -- Input value states to be passed to Flash Attention API -- attention_mask (`torch.Tensor`): -- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the -- position of padding tokens and 1 for the position of non-padding tokens. -- dropout (`int`, *optional*): -- Attention dropout -- softmax_scale (`float`, *optional*): -- The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) -- """ -- # Contains at least one padding token in the sequence -- if attention_mask is not None: -- batch_size = query_states.shape[0] -- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( -- query_states, key_states, value_states, attention_mask, query_length -- ) -- -- cu_seqlens_q, cu_seqlens_k = cu_seq_lens -- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens -- -- attn_output_unpad = flash_attn_varlen_func( -- query_states, -- key_states, -- value_states, -- cu_seqlens_q=cu_seqlens_q, -- cu_seqlens_k=cu_seqlens_k, -- max_seqlen_q=max_seqlen_in_batch_q, -- max_seqlen_k=max_seqlen_in_batch_k, -- dropout_p=dropout, -- softmax_scale=softmax_scale, -- causal=self.is_causal, -- ) -- -- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) -- else: -- attn_output = flash_attn_func( -- query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal -- ) -- -- return attn_output -- -- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): -- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) -- batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape -- -- key_layer = index_first_axis( -- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k -- ) -- value_layer = index_first_axis( -- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k -- ) -- if query_length == kv_seq_len: -- query_layer = index_first_axis( -- query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k -- ) -- cu_seqlens_q = cu_seqlens_k -- max_seqlen_in_batch_q = max_seqlen_in_batch_k -- indices_q = indices_k -- elif query_length == 1: -- max_seqlen_in_batch_q = 1 -- cu_seqlens_q = torch.arange( -- batch_size + 1, dtype=torch.int32, device=query_layer.device -- ) # There is a memcpy here, that is very bad. -- indices_q = cu_seqlens_q[:-1] -- query_layer = query_layer.squeeze(1) -- else: -- # The -q_len: slice assumes left padding. 
-- attention_mask = attention_mask[:, -query_length:] -- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) -- -- return ( -- query_layer, -- key_layer, -- value_layer, -- indices_q, -- (cu_seqlens_q, cu_seqlens_k), -- (max_seqlen_in_batch_q, max_seqlen_in_batch_k), -- ) -- -- --class LlamaDecoderLayer(nn.Module): -- def __init__(self, config: LlamaConfig): -- super().__init__() -- self.hidden_size = config.hidden_size -- self.self_attn = ( -- LlamaAttention(config=config) -- if not getattr(config, "_flash_attn_2_enabled", False) -- else LlamaFlashAttention2(config=config) -- ) -- self.mlp = LlamaMLP(config) -- self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) -- self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) -- -- def forward( -- self, -- hidden_states: torch.Tensor, -- attention_mask: Optional[torch.Tensor] = None, -- position_ids: Optional[torch.LongTensor] = None, -- past_key_value: Optional[Tuple[torch.Tensor]] = None, -- output_attentions: Optional[bool] = False, -- use_cache: Optional[bool] = False, -- **kwargs, -- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: -- """ -- Args: -- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` -- attention_mask (`torch.FloatTensor`, *optional*): -- attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, -- query_sequence_length, key_sequence_length)` if default attention is used. -- output_attentions (`bool`, *optional*): -- Whether or not to return the attentions tensors of all attention layers. See `attentions` under -- returned tensors for more detail. -- use_cache (`bool`, *optional*): -- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding -- (see `past_key_values`). -- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states -- """ -- if "padding_mask" in kwargs: -- warnings.warn( -- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" -- ) -- -- residual = hidden_states -- -- hidden_states = self.input_layernorm(hidden_states) -- -- # Self Attention -- hidden_states, self_attn_weights, present_key_value = self.self_attn( -- hidden_states=hidden_states, -- attention_mask=attention_mask, -- position_ids=position_ids, -- past_key_value=past_key_value, -- output_attentions=output_attentions, -- use_cache=use_cache, -- **kwargs, -- ) -- hidden_states = residual + hidden_states -- -- # Fully Connected -- residual = hidden_states -- hidden_states = self.post_attention_layernorm(hidden_states) -- hidden_states = self.mlp(hidden_states) -- hidden_states = residual + hidden_states -- -- outputs = (hidden_states,) -- -- if output_attentions: -- outputs += (self_attn_weights,) -- -- if use_cache: -- outputs += (present_key_value,) -- -- return outputs -- -- --LLAMA_START_DOCSTRING = r""" -- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the -- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads -- etc.) -- -- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
-- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage -- and behavior. -- -- Parameters: -- config ([`LlamaConfig`]): -- Model configuration class with all the parameters of the model. Initializing with a config file does not -- load the weights associated with the model, only the configuration. Check out the -- [`~PreTrainedModel.from_pretrained`] method to load the model weights. --""" -- -- --@add_start_docstrings( -- "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", -- LLAMA_START_DOCSTRING, --) --class LlamaPreTrainedModel(PreTrainedModel): -- config_class = LlamaConfig -- base_model_prefix = "model" -- supports_gradient_checkpointing = True -- _no_split_modules = ["LlamaDecoderLayer"] -- _skip_keys_device_placement = "past_key_values" -- _supports_flash_attn_2 = True -- -- def _init_weights(self, module): -- std = self.config.initializer_range -- if isinstance(module, nn.Linear): -- module.weight.data.normal_(mean=0.0, std=std) -- if module.bias is not None: -- module.bias.data.zero_() -- elif isinstance(module, nn.Embedding): -- module.weight.data.normal_(mean=0.0, std=std) -- if module.padding_idx is not None: -- module.weight.data[module.padding_idx].zero_() -- -- --LLAMA_INPUTS_DOCSTRING = r""" -- Args: -- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): -- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide -- it. -- -- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and -- [`PreTrainedTokenizer.__call__`] for details. -- -- [What are input IDs?](../glossary#input-ids) -- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): -- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: -- -- - 1 for tokens that are **not masked**, -- - 0 for tokens that are **masked**. -- -- [What are attention masks?](../glossary#attention-mask) -- -- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and -- [`PreTrainedTokenizer.__call__`] for details. -- -- If `past_key_values` is used, optionally only the last `input_ids` have to be input (see -- `past_key_values`). -- -- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] -- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more -- information on the default strategy. -- -- - 1 indicates the head is **not masked**, -- - 0 indicates the head is **masked**. -- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): -- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, -- config.n_positions - 1]`. -- -- [What are position IDs?](../glossary#position-ids) -- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): -- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape -- `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape -- `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
-- -- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention -- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. -- -- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't -- have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` -- of shape `(batch_size, sequence_length)`. -- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): -- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This -- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the -- model's internal embedding lookup matrix. -- use_cache (`bool`, *optional*): -- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see -- `past_key_values`). -- output_attentions (`bool`, *optional*): -- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned -- tensors for more detail. -- output_hidden_states (`bool`, *optional*): -- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for -- more detail. -- return_dict (`bool`, *optional*): -- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. --""" -- -- --@add_start_docstrings( -- "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", -- LLAMA_START_DOCSTRING, --) --class LlamaModel(LlamaPreTrainedModel): -- """ -- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] -- -- Args: -- config: LlamaConfig -- """ -- -- def __init__(self, config: LlamaConfig): -- super().__init__(config) -- self.padding_idx = config.pad_token_id -- self.vocab_size = config.vocab_size -- -- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) -- self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) -- self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) -- -- self.gradient_checkpointing = False -- # Initialize weights and apply final processing -- self.post_init() -- -- def get_input_embeddings(self): -- return self.embed_tokens -- -- def set_input_embeddings(self, value): -- self.embed_tokens = value -- -- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) -- def forward( -- self, -- input_ids: torch.LongTensor = None, -- attention_mask: Optional[torch.Tensor] = None, -- position_ids: Optional[torch.LongTensor] = None, -- past_key_values: Optional[List[torch.FloatTensor]] = None, -- inputs_embeds: Optional[torch.FloatTensor] = None, -- use_cache: Optional[bool] = None, -- output_attentions: Optional[bool] = None, -- output_hidden_states: Optional[bool] = None, -- return_dict: Optional[bool] = None, -- ) -> Union[Tuple, BaseModelOutputWithPast]: -- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions -- output_hidden_states = ( -- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states -- ) -- use_cache = use_cache if use_cache is not None else self.config.use_cache -- -- return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- -- # retrieve input_ids and inputs_embeds -- if input_ids 
is not None and inputs_embeds is not None: -- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") -- elif input_ids is not None: -- batch_size, seq_length = input_ids.shape[:2] -- elif inputs_embeds is not None: -- batch_size, seq_length = inputs_embeds.shape[:2] -- else: -- raise ValueError("You have to specify either input_ids or inputs_embeds") -- -- past_key_values_length = 0 -- if past_key_values is not None: -- past_key_values_length = past_key_values[0][0].shape[2] -- -- # new_key_values_shape=past_key_values.shape -- # new_key_values_shape[-2]=seq_length -- # next_decoder_cache=torch.empty(new_key_values_shape,dtype=past_key_values[0][0].dtype) -- -- -- if position_ids is None: -- device = input_ids.device if input_ids is not None else inputs_embeds.device -- position_ids = torch.arange( -- past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device -- ) -- position_ids = position_ids.unsqueeze(0) -- -- if inputs_embeds is None: -- inputs_embeds = self.embed_tokens(input_ids) -- -- if getattr(self.config, "_flash_attn_2_enabled", False): -- # 2d mask is passed through the layers -- attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None -- else: -- # 4d mask is passed through the layers -- attention_mask = _prepare_4d_causal_attention_mask( -- attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length -- ) -- -- # embed positions -- hidden_states = inputs_embeds -- -- if self.gradient_checkpointing and self.training: -- if use_cache: -- logger.warning_once( -- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." -- ) -- use_cache = False -- -- # decoder layers -- all_hidden_states = () if output_hidden_states else None -- all_self_attns = () if output_attentions else None -- next_decoder_cache = [] if use_cache else None -- -- for idx, decoder_layer in enumerate(self.layers): -- if output_hidden_states: -- all_hidden_states += (hidden_states,) -- -- past_key_value = past_key_values[idx] if past_key_values is not None else None -- -- if self.gradient_checkpointing and self.training: -- layer_outputs = self._gradient_checkpointing_func( -- decoder_layer.__call__, -- hidden_states, -- attention_mask, -- position_ids, -- past_key_value, -- output_attentions, -- use_cache, -- ) -- else: -- layer_outputs = decoder_layer( -- hidden_states, -- attention_mask=attention_mask, -- position_ids=position_ids, -- past_key_value=past_key_value, -- output_attentions=output_attentions, -- use_cache=use_cache, -- ) -- -- hidden_states = layer_outputs[0] -- -- if use_cache: -- key_values= layer_outputs[2 if output_attentions else 1] -- # next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) -- # next_decoder_cache[idx][0] = key_values[0] -- # next_decoder_cache[idx][1] = key_values[1] -- next_decoder_cache.extend(layer_outputs[2 if output_attentions else 1]) -- -- if output_attentions: -- all_self_attns += (layer_outputs[1],) -- -- hidden_states = self.norm(hidden_states) -- -- # add hidden states from the last decoder layer -- if output_hidden_states: -- all_hidden_states += (hidden_states,) -- -- next_cache = torch.concat(next_decoder_cache).reshape(len(self.layers),2,*next_decoder_cache[0].shape) if use_cache else None -- if output_attentions: -- all_self_attns = torch.concat(all_self_attns).reshape(len(self.layers),*all_self_attns[0].shape) -- if not return_dict: -- return tuple(v for v in [hidden_states, 
next_cache, all_hidden_states, all_self_attns] if v is not None) -- return BaseModelOutputWithPast( -- last_hidden_state=hidden_states, -- past_key_values=next_cache, -- hidden_states=all_hidden_states, -- attentions=all_self_attns, -- ) -- -- --class LlamaForCausalLM(LlamaPreTrainedModel): -- _tied_weights_keys = ["lm_head.weight"] -- -- def __init__(self, config): -- super().__init__(config) -- self.model = LlamaModel(config) -- self.vocab_size = config.vocab_size -- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) -- -- # Initialize weights and apply final processing -- self.post_init() -- -- def get_input_embeddings(self): -- return self.model.embed_tokens -- -- def set_input_embeddings(self, value): -- self.model.embed_tokens = value -- -- def get_output_embeddings(self): -- return self.lm_head -- -- def set_output_embeddings(self, new_embeddings): -- self.lm_head = new_embeddings -- -- def set_decoder(self, decoder): -- self.model = decoder -- -- def get_decoder(self): -- return self.model -- -- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) -- @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) -- def forward( -- self, -- input_ids: torch.LongTensor = None, -- attention_mask: Optional[torch.Tensor] = None, -- position_ids: Optional[torch.LongTensor] = None, -- past_key_values: Optional[List[torch.FloatTensor]] = None, -- inputs_embeds: Optional[torch.FloatTensor] = None, -- labels: Optional[torch.LongTensor] = None, -- use_cache: Optional[bool] = None, -- output_attentions: Optional[bool] = None, -- output_hidden_states: Optional[bool] = None, -- return_dict: Optional[bool] = None, -- ) -> Union[Tuple, CausalLMOutputWithPast]: -- r""" -- Args: -- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): -- Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., -- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored -- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. -- -- Returns: -- -- Example: -- -- ```python -- >>> from transformers import AutoTokenizer, LlamaForCausalLM -- -- >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) -- >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) -- -- >>> prompt = "Hey, are you conscious? Can you talk to me?" -- >>> inputs = tokenizer(prompt, return_tensors="pt") -- -- >>> # Generate -- >>> generate_ids = model.generate(inputs.input_ids, max_length=30) -- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] -- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
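# Illustrative aside with toy shapes (not part of the original example): the modified
# LlamaModel.forward above returns the KV cache as one stacked tensor of shape
# (n_layer, 2, batch, n_heads, seq_len, head_dim) instead of a tuple of per-layer pairs.
import torch
n_layer, batch, n_heads, seq_len, head_dim = 2, 1, 4, 8, 64
per_layer = [(torch.randn(batch, n_heads, seq_len, head_dim),
              torch.randn(batch, n_heads, seq_len, head_dim)) for _ in range(n_layer)]
next_decoder_cache = []
for k, v in per_layer:
    next_decoder_cache.extend([k, v])            # mirrors next_decoder_cache.extend(...)
next_cache = torch.concat(next_decoder_cache).reshape(
    n_layer, 2, *next_decoder_cache[0].shape)    # mirrors the reshape in the patch
assert next_cache.shape == (n_layer, 2, batch, n_heads, seq_len, head_dim)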
-- ```""" -- -- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions -- output_hidden_states = ( -- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states -- ) -- return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- -- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) -- outputs = self.model( -- input_ids=input_ids, -- attention_mask=attention_mask, -- position_ids=position_ids, -- past_key_values=past_key_values, -- inputs_embeds=inputs_embeds, -- use_cache=use_cache, -- output_attentions=output_attentions, -- output_hidden_states=output_hidden_states, -- return_dict=return_dict, -- ) -- -- hidden_states = outputs[0] -- if self.config.pretraining_tp > 1: -- lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) -- logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] -- logits = torch.cat(logits, dim=-1) -- else: -- logits = self.lm_head(hidden_states) -- logits = logits.float() -- -- loss = None -- if labels is not None: -- # Shift so that tokens < n predict n -- shift_logits = logits[..., :-1, :].contiguous() -- shift_labels = labels[..., 1:].contiguous() -- # Flatten the tokens -- loss_fct = CrossEntropyLoss() -- shift_logits = shift_logits.view(-1, self.config.vocab_size) -- shift_labels = shift_labels.view(-1) -- # Enable model parallelism -- shift_labels = shift_labels.to(shift_logits.device) -- loss = loss_fct(shift_logits, shift_labels) -- -- if not return_dict: -- output = (logits,) + outputs[1:] -- return (loss,) + output if loss is not None else output -- -- return CausalLMOutputWithPast( -- loss=loss, -- logits=logits, -- past_key_values=outputs.past_key_values, -- hidden_states=outputs.hidden_states, -- attentions=outputs.attentions, -- ) -- -- def prepare_inputs_for_generation( -- self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs -- ): -- if past_key_values is not None: -- past_length = past_key_values[0][0].shape[2] -- -- # Some generation methods already pass only the last input ID -- if input_ids.shape[1] > past_length: -- remove_prefix_length = past_length -- else: -- # Default to old behavior: keep only final ID -- remove_prefix_length = input_ids.shape[1] - 1 -- -- input_ids = input_ids[:, remove_prefix_length:] -- -- position_ids = kwargs.get("position_ids", None) -- if attention_mask is not None and position_ids is None: -- # create position_ids on the fly for batch generation -- position_ids = attention_mask.long().cumsum(-1) - 1 -- position_ids.masked_fill_(attention_mask == 0, 1) -- if past_key_values: -- position_ids = position_ids[:, -input_ids.shape[1] :] -- -- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step -- if inputs_embeds is not None and past_key_values is None: -- model_inputs = {"inputs_embeds": inputs_embeds} -- else: -- model_inputs = {"input_ids": input_ids} -- -- model_inputs.update( -- { -- "position_ids": position_ids, -- "past_key_values": past_key_values, -- "use_cache": kwargs.get("use_cache"), -- "attention_mask": attention_mask, -- } -- ) -- return model_inputs -- -- @staticmethod -- def _reorder_cache(past_key_values, beam_idx): -- reordered_past = () -- for layer_past in past_key_values: -- reordered_past += ( -- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), -- ) -- return 
reordered_past -- -- --@add_start_docstrings( -- """ -- The LLaMa Model transformer with a sequence classification head on top (linear layer). -- -- [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models -- (e.g. GPT-2) do. -- -- Since it does classification on the last token, it requires to know the position of the last token. If a -- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If -- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the -- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in -- each row of the batch). -- """, -- LLAMA_START_DOCSTRING, --) --class LlamaForSequenceClassification(LlamaPreTrainedModel): -- def __init__(self, config): -- super().__init__(config) -- self.num_labels = config.num_labels -- self.model = LlamaModel(config) -- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) -- -- # Initialize weights and apply final processing -- self.post_init() -- -- def get_input_embeddings(self): -- return self.model.embed_tokens -- -- def set_input_embeddings(self, value): -- self.model.embed_tokens = value -- -- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) -- def forward( -- self, -- input_ids: torch.LongTensor = None, -- attention_mask: Optional[torch.Tensor] = None, -- position_ids: Optional[torch.LongTensor] = None, -- past_key_values: Optional[List[torch.FloatTensor]] = None, -- inputs_embeds: Optional[torch.FloatTensor] = None, -- labels: Optional[torch.LongTensor] = None, -- use_cache: Optional[bool] = None, -- output_attentions: Optional[bool] = None, -- output_hidden_states: Optional[bool] = None, -- return_dict: Optional[bool] = None, -- ) -> Union[Tuple, SequenceClassifierOutputWithPast]: -- r""" -- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): -- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., -- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If -- `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
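As the docstring above explains, LlamaForSequenceClassification pools the logit of the last non-padding token in each row. A self-contained illustration of how that index is located (toy input_ids and pad_token_id, not taken from the patch):

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([[11, 12, 13, 0, 0],    # two trailing pad tokens
                          [21, 22, 23, 24, 25]]) # no padding
# position of the first pad token minus one; rows without padding wrap to -1 (last position)
sequence_lengths = torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1
print(sequence_lengths)  # tensor([ 2, -1])
```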
-- """ -- return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- -- transformer_outputs = self.model( -- input_ids, -- attention_mask=attention_mask, -- position_ids=position_ids, -- past_key_values=past_key_values, -- inputs_embeds=inputs_embeds, -- use_cache=use_cache, -- output_attentions=output_attentions, -- output_hidden_states=output_hidden_states, -- return_dict=return_dict, -- ) -- hidden_states = transformer_outputs[0] -- logits = self.score(hidden_states) -- -- if input_ids is not None: -- batch_size = input_ids.shape[0] -- else: -- batch_size = inputs_embeds.shape[0] -- -- if self.config.pad_token_id is None and batch_size != 1: -- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") -- if self.config.pad_token_id is None: -- sequence_lengths = -1 -- else: -- if input_ids is not None: -- sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( -- logits.device -- ) -- else: -- sequence_lengths = -1 -- -- pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] -- -- loss = None -- if labels is not None: -- labels = labels.to(logits.device) -- if self.config.problem_type is None: -- if self.num_labels == 1: -- self.config.problem_type = "regression" -- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): -- self.config.problem_type = "single_label_classification" -- else: -- self.config.problem_type = "multi_label_classification" -- -- if self.config.problem_type == "regression": -- loss_fct = MSELoss() -- if self.num_labels == 1: -- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) -- else: -- loss = loss_fct(pooled_logits, labels) -- elif self.config.problem_type == "single_label_classification": -- loss_fct = CrossEntropyLoss() -- loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) -- elif self.config.problem_type == "multi_label_classification": -- loss_fct = BCEWithLogitsLoss() -- loss = loss_fct(pooled_logits, labels) -- if not return_dict: -- output = (pooled_logits,) + transformer_outputs[1:] -- return ((loss,) + output) if loss is not None else output -- -- return SequenceClassifierOutputWithPast( -- loss=loss, -- logits=pooled_logits, -- past_key_values=transformer_outputs.past_key_values, -- hidden_states=transformer_outputs.hidden_states, -- attentions=transformer_outputs.attentions, -- ) -diff -uNr ascend-llm/export_llama/quantize.py ascend-llm-qwen/export_llama/quantize.py ---- ascend-llm/export_llama/quantize.py 2024-09-04 19:21:03.078081000 +0800 -+++ ascend-llm-qwen/export_llama/quantize.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,183 +0,0 @@ --import torch --from torch import nn,Tensor --from typing import Optional,List,Tuple --from torch.onnx.symbolic_helper import parse_args -- --class MatMulInteger(torch.autograd.Function): -- @staticmethod -- def forward(ctx, x:torch.Tensor,weight_t:torch.Tensor): -- res = torch.matmul(x.to(dtype=torch.float32),weight_t.to(torch.float32)) -- # res=torch.matmul(x.to(dtype=torch.int32,device="cpu") , # torch不支持CUDA上的int8矩阵乘 -- # weight_t.to(dtype=torch.int32,device="cpu")).to(x.device) -- return res -- -- @staticmethod -- @parse_args("v","v") -- def symbolic(g:torch._C.Graph, x:torch.Tensor,weight_t:torch.Tensor): -- return g.op("MatMulInteger", x,weight_t) -- --matmulInteger = MatMulInteger.apply -- --def quantize_mat(mat:Tensor)-> Tuple[Tensor,Tensor]: -- # max_val = torch.max(torch.abs(mat),dim=-1)[0] -- # mat = (mat * (127 / 
max_val)[...,None]).to(dtype=torch.int8) -- max_val = (torch.max(torch.abs(mat),dim=-1)[0] / 127.0).to(dtype=mat.dtype) -- mat = (mat / max_val[...,None]).to(dtype=torch.int8) -- return mat, max_val -- --def dequantize_mat(mat:Tensor,max_val:Tensor): -- return torch.mul(mat,max_val.unsqueeze(-1)) -- --def decomposition(mat:Tensor,unq_idx:Tensor,t:Tensor) -> Tuple[Tensor,Tensor,Tensor,Tensor]: -- return mat.mul(t.to(dtype=mat.dtype)),mat[...,unq_idx] -- mat=mat.clone() -- mat_unq = mat[...,unq_idx] -- if mat.dim() == 3: -- mat[:,:,unq_idx] = 0 -- elif mat.dim() == 4: -- mat[:,:,:,unq_idx] = 0 -- elif mat.dim() == 2: -- mat[:,unq_idx] = 0 -- return mat,mat_unq -- --def get_unq_idx_topk(mat:Tensor,k:int=64): -- idx=torch.topk(mat.view(-1,mat.shape[-1]).abs().max(dim=-2)[0],k,dim=-1)[1] -- t = torch.ones((mat.shape[-1]),dtype=mat.dtype,device=mat.device) -- t = t.clone() -- t[idx] = 0 -- return idx,t -- --def get_unq_idx_thres(mat:Tensor,threshold:float=6.0): -- k = mat.view(-1,mat.shape[-1]).abs().max(dim=-2)[0] >= threshold -- return k.nonzero().view(-1), k -- --def qMatmul(x_q:Tensor,x_max:Tensor,weight_q:Tensor,w_max:Tensor,dtype): -- res_q = matmulInteger(x_q , weight_q) -- mx = nn.functional.linear(x_max.unsqueeze(-1),w_max.unsqueeze(-1)) -- res = torch.mul(res_q.to(device=mx.device,dtype=torch.float32), mx.to(torch.float32) ).to(dtype=dtype) -- # res = torch.mul((res_q.to(device=mx.device,dtype=torch.float32) / (127.0*127.0)).to(torch.float16), mx ) -- return res -- --class W8Linear(nn.Module): -- def __init__(self, origin_weight:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): -- super().__init__() -- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) -- self.dtype = origin_weight.dtype -- self.alpha = alpha -- self.weight_q,self.max_val = quantize_mat(origin_weight.detach()) -- self.weight_q = nn.Parameter(self.weight_q,requires_grad=False) -- self.max_val = nn.Parameter(self.max_val,requires_grad=False) -- -- def forward(self,x:Tensor) -> Tensor: -- return nn.functional.linear(x,dequantize_mat(self.weight_q,self.max_val),bias=self.bias) -- --# act_max for smooth --class W8X8Linear(nn.Module): -- def __init__(self, ori_w:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): -- super().__init__() -- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) -- self.dtype = ori_w.dtype -- self.alpha = alpha -- self.scales = None -- if act_max is not None: -- act_max = act_max.to(ori_w.device) -- self.scales = (act_max.pow(alpha) / ori_w.abs().max(dim=0)[0].pow(1 - alpha)).clamp(min=1e-5).to(dtype=ori_w.dtype) -- self.scales = nn.Parameter(self.scales,requires_grad=False) -- ori_w = ori_w.detach().mul(self.scales) -- self.weight_q,self.max_val = quantize_mat(ori_w.detach()) -- self.weight_q = nn.Parameter(self.weight_q.t(),requires_grad=False) -- self.max_val = nn.Parameter(self.max_val,requires_grad=False) -- -- def forward(self,x:Tensor) -> Tensor: -- if self.scales is not None: -- x = x.div(self.scales) -- x_q,x_max = quantize_mat(x) -- res = qMatmul(x_q,x_max,self.weight_q,self.max_val,x.dtype) -- if self.bias is not None: -- res = res + self.bias -- return res -- --# static decomposition --class W8SDLinear(nn.Module): -- def __init__(self, origin_weight:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): -- super().__init__() -- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) -- self.dtype = origin_weight.dtype -- self.alpha 
= alpha -- if act_max is not None: -- self.idx_unq,self.t = get_unq_idx_topk(act_max,self.alpha) -- else: -- self.idx_unq,self.t = get_unq_idx_topk(origin_weight,self.alpha) -- self.idx_unq,self.t = self.idx_unq.to(origin_weight.device),self.t.to(origin_weight.device) -- self.weight_q,self.weight_unq = decomposition(origin_weight,self.idx_unq,self.t) -- self.weight_q,self.w_max = quantize_mat(self.weight_q.detach()) -- self.weight_q = nn.Parameter(self.weight_q.t(),requires_grad=False) -- self.weight_unq = nn.Parameter(self.weight_unq.t(),requires_grad=False) -- self.w_max = nn.Parameter(self.w_max,requires_grad=False) -- self.t = nn.Parameter(self.t,requires_grad=False) -- self.idx_unq = nn.Parameter(self.idx_unq,requires_grad=False) -- -- def forward(self,x:Tensor) -> Tensor: -- x_q,x_unq = decomposition(x,self.idx_unq,self.t) -- x_q,x_max = quantize_mat(x_q) -- res_q = qMatmul(x_q,x_max,self.weight_q,self.w_max,x.dtype) -- res_unq = torch.matmul(x_unq, self.weight_unq) -- if self.bias is not None: -- res_unq += self.bias -- return res_q + res_unq -- --class W8DXLinear(nn.Module): -- def __init__(self, origin_weight:Tensor, bias: Optional[Tensor] = None,act_max:Optional[Tensor] = None,alpha=32): -- super().__init__() -- self.bias = None if bias is None else nn.Parameter(bias,requires_grad=False) -- self.dtype = origin_weight.dtype -- self.alpha = alpha -- self.weight_q,self.max_val = quantize_mat(origin_weight.detach()) -- self.weight_q = nn.Parameter(self.weight_q.t(),requires_grad=False) -- self.max_val = nn.Parameter(self.max_val,requires_grad=False) -- -- def forward(self,x:Tensor) -> Tensor: -- idx_unq,t = get_unq_idx_topk(x,self.alpha) -- x_q,x_unq = decomposition(x,idx_unq,t) -- x_q,x_max = quantize_mat(x_q) -- res_q = qMatmul(x_q,x_max,self.weight_q,self.max_val,x.dtype) -- weight_unq= torch.mul(self.weight_q[idx_unq,:],self.max_val.unsqueeze(0)) -- res_unq = torch.matmul(x_unq, weight_unq) -- if self.bias is not None: -- res_unq += self.bias -- return res_q + res_unq -- -- --quant_cls = { -- "W8":W8Linear, -- "W8X8":W8X8Linear, -- "W8SD":W8SDLinear, -- "W8DX":W8DXLinear --} -- --def replace_linear_modules(module:nn.Module,prefix:str,act_scales,cfg): -- for name, child in module.named_children(): -- fullname = (prefix + '.' + name) if prefix != '' else name -- if isinstance(child, nn.Linear): -- strs = fullname.split(".") -- # fullname: model.layers.21.self_attn.q_proj layer_name: 21.q_proj; name: q_proj -- # fullname: lm_head; layer_name: 21.q_proj; name: q_proj; -- layer_name = (strs[-3] + "." 
+ strs[-1]) if len(strs) > 2 else strs[-1] -- if layer_name not in cfg: -- continue -- act_scale = None if act_scales is None or 'act_scale' not in cfg[layer_name] else act_scales[fullname] -- alpha = 32 if 'alpha' not in cfg[layer_name] else cfg[layer_name]['alpha'] -- setattr(module, name,quant_cls[cfg[layer_name]['type']] -- (child.weight,child.bias,act_max=act_scale,alpha=alpha)) -- else: -- replace_linear_modules(child,fullname,act_scales,cfg) -- --def quantize(model:nn.Module,cfg={}): -- act_scales = None -- if 'act_scales_path' in cfg: -- act_scales = torch.load(cfg['act_scales_path']) -- if 'smooth' in cfg: -- from smooth import smooth_lm -- alpha = 0.85 if "alpha" not in cfg else cfg["alpha"] -- smooth_lm(model, act_scales, alpha) -- replace_linear_modules(model,'',act_scales,cfg) -\ No newline at end of file -diff -uNr ascend-llm/export_llama/readme.md ascend-llm-qwen/export_llama/readme.md ---- ascend-llm/export_llama/readme.md 2024-09-04 19:21:03.078081000 +0800 -+++ ascend-llm-qwen/export_llama/readme.md 1970-01-01 08:00:00.000000000 +0800 -@@ -1,70 +0,0 @@ --# Llama 模型导出 -- --## 量化 -- --主要参考的量化方案有[LLM.int8](https://arxiv.org/abs/2208.07339)和[SmoothQuant](https://arxiv.org/abs/2211.10438) -- --量化需要引入`quantize.py`和config文件下的配置文件,目前量化方式共有四种:int8仅权重量化(W8),int8全量化(W8X8,传入act_scale可以平滑激活),静态混合精度分解(SD)和动态混合精度分解(W8DX)。根据Llama模型特点,down_proj比qkv_proj, up_proj, gate_proj更难以量化,更深的Decoder Layer更难以量化。配置文件的格式为 --```python --quantize_cfg = { -- "0.q_proj":{ # 字典的key为具体某一层,第几个Decoder Layer+线性层名字 -- "type":"W8SD", # 量化类型 -- "act_scale":True, # type=W8X8表示采用平滑激活,type=W8SD表示用生成的act scale进行静态混合精度分解,如果不采用直接不填这一项,判断时只判断是否存在字典中是否存在act_scale的key-value对,不检查值。 -- "alpha":0.85 # 平滑激活的迁移系数,混合精度分解的将多少特征保留为FP16 -- }, -- "act_scales_path":"/root/zanilia/export_llama/act_scales/llama-2-7b.pt", -- "smooth":True, # SmoothQuant的方案,将激活值的缩放与RMSNorm融合,不会造成额外的开销,但down_proj层无法使用 -- "alpha":0.85, #SmoothQuant的迁移系数 --} --``` --创建新的配置文件方式,新建一个python源文件并提供get函数接口,参数为模型配置和act_scale路径,返回dict格式的量化配置。 --在config文件夹下,提供了几个常用的量化配置:int8仅权重量化(w8.py),int8全量化(w8x8.py),静态混合精度分解(sd.py),动态混合精度分解(w8dx.py),平滑激活(smooth.py),平滑+静态混合精度分解(smsd.py)。 -- --## 代码解析 -- --### export_llama.py -- --通过`python export_llama.py`导出onnx -- --### modeling_llama.py -- --对llama模型进行修改,主要修改内容 --1. 只返回新生成的KV缓存(默认返回所有KV缓存),将返回KV缓存有tuple变为Tensor(torch.cat) --2. 修改LlamaRotaryEmbedding类,原来的方式采用cache的方式,调用时参数为seq_len,如果返回cache的前seq len个元素(如果不足,则需要再次生成)。修改后,调用每次返回max_position_embeddings个元素。所有元素都提前生成了,seq len参数没有使用,不会再次生成,在导出前应将max_position_embeddings设置大一些。 -- 修改原因主要是:调用LlamaRotaryEmbedding的seq len,为输入长度+kv长度。在apply_rotary_pos_emb,使用position ids为下标取LlamaRotaryEmbedding的输出,获得世纪的PosEmbedding。转om时,这输入长度+kv长度是固定值,如果通过StreamingLLM,H2O等方式驱逐KV缓存,position_ids会超过输入长度+kv长度,导致错误。也可以修改代码获取真实的输入长度+kv长度。 -- --### export_llama.py -- --将llama模型导出为onnx文件 -- --## quantize.py -- --量化相关代码,总共有四种方法。 -- --1. W8Linear: int8仅权重量化 --2. W8X8Linear: vector-wise absmax int8全量化 --3. W8SDLinear: 静态的混合精度分解,分解方式可以使用按权重分解和按act max分解(推荐按act max分解) --4. 
W8DXLinear: 动态根据输入进行混合精度分解 -- --### smooth.py -- --直接使用[SmoothQuant](https://github.com/mit-han-lab/smoothquant/) 的相关代码,对激活进行平滑,降低量化难度,调用smooth_lm接口进行量化。smoothquant目前对于q_proj,k_proj,v_proj,gate_proj,up_porj进行平滑,对于down_proj的平滑,可以在W8X8Linear参数中传入act max。 -- --### generate_act_scales.py -- --直接使用[SmoothQuant](https://github.com/mit-han-lab/smoothquant/) 的相关代码,可以计算某个模型在特定数据集上激活值的最大值,可以用于smoothquant方法的平滑操作和W8SDLinear的混合精度分解。 -- --```bash --python examples/generate_act_scales.py \ -- --model-name \ -- --output-path \ -- --num-samples \ -- --seq-len \ -- --dataset-path --``` -- --### change_node.py -- --将cast fp->int8算子转换为AscendQuant算子,用于atc模型转换 -- -- -diff -uNr ascend-llm/export_llama/requirements.txt ascend-llm-qwen/export_llama/requirements.txt ---- ascend-llm/export_llama/requirements.txt 2024-09-04 19:21:03.078081000 +0800 -+++ ascend-llm-qwen/export_llama/requirements.txt 1970-01-01 08:00:00.000000000 +0800 -@@ -1,4 +0,0 @@ --torch --transformers==4.35 --onnx --lm-eval==0.4.2 # for eval -\ No newline at end of file -diff -uNr ascend-llm/export_llama/smooth.py ascend-llm-qwen/export_llama/smooth.py ---- ascend-llm/export_llama/smooth.py 2024-09-04 19:21:03.078081000 +0800 -+++ ascend-llm-qwen/export_llama/smooth.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,56 +0,0 @@ --''' --code from https://github.com/mit-han-lab/smoothquant/ --''' --import torch --import torch.nn as nn -- --from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm -- --@torch.no_grad() --def smooth_ln_fcs_llama_like(ln, fcs, act_scales, alpha=0.5): -- if not isinstance(fcs, list): -- fcs = [fcs] -- assert isinstance(ln, (LlamaRMSNorm,nn.Linear)) -- for fc in fcs: -- assert isinstance(fc, nn.Linear) -- assert ln.weight.shape[0] == fc.in_features == act_scales.numel() -- device, dtype = fcs[0].weight.device, fcs[0].weight.dtype -- act_scales = act_scales.to(device=device, dtype=dtype) -- weight_scales = torch.cat( -- [fc.weight.abs().max(dim=0, keepdim=True)[0] for fc in fcs], dim=0 -- ) -- weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5) -- scales = ( -- (act_scales.pow(alpha) / weight_scales.pow(1 - alpha)) -- .clamp(min=1e-5) -- .to(device) -- .to(dtype) -- ) -- if ln.weight.dim() == 2: -- ln.weight.div_(scales.unsqueeze(-1)) -- else: -- ln.weight.div_(scales) -- for fc in fcs: -- fc.weight.mul_(scales.view(1, -1)) -- -- --@torch.no_grad() --def smooth_lm(model, scales, alpha=0.5): -- for name, module in model.named_modules(): -- if isinstance(module, LlamaDecoderLayer): -- attn_ln = module.input_layernorm # attention forward norm -- qkv = [ -- module.self_attn.q_proj, -- module.self_attn.k_proj, -- module.self_attn.v_proj, -- ] -- -- qkv_input_scales = scales[name + ".self_attn.q_proj"] -- smooth_ln_fcs_llama_like(attn_ln, qkv, qkv_input_scales, alpha) -- -- ffn_ln = module.post_attention_layernorm # feed forward norm -- fcs = [module.mlp.gate_proj, module.mlp.up_proj] -- fcs_input_scales = scales[name + ".mlp.gate_proj"] -- -- smooth_ln_fcs_llama_like(ffn_ln, fcs, fcs_input_scales, alpha) -- # smooth_ln_fcs_llama_like(module.mlp.up_proj,module.mlp.down_proj,scales[name + ".mlp.down_proj"],0.9) -\ No newline at end of file ++ ).npu() + model_cfg=model.model.config + spec = importlib.util.spec_from_file_location("quant_cfg_module", quant_cfg_path) + quant_cfg_module = importlib.util.module_from_spec(spec) diff -uNr ascend-llm/inference/config.py ascend-llm-qwen/inference/config.py ---- ascend-llm/inference/config.py 2024-09-04 19:21:03.079083400 +0800 -+++ 
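All four linear variants in the removed quantize.py are built on the same per-row absmax int8 primitive (quantize_mat / dequantize_mat). A hedged standalone sketch of just that primitive, run on a toy tensor rather than real weights:

```python
import torch

def quantize_mat(mat: torch.Tensor):
    # per-row absmax scale, as in quantize.py: scale = max(|row|) / 127
    scale = (mat.abs().max(dim=-1)[0] / 127.0).to(mat.dtype)
    q = (mat / scale[..., None]).to(torch.int8)
    return q, scale

def dequantize_mat(q: torch.Tensor, scale: torch.Tensor):
    # reverse step: int8 values scaled back to the original dtype
    return q.to(scale.dtype) * scale[..., None]

x = torch.randn(4, 16)
q, s = quantize_mat(x)
print((dequantize_mat(q, s) - x).abs().max())  # small per-row quantization error
```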
ascend-llm-qwen/inference/config.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,64 +0,0 @@ --from dataclasses import dataclass,field --from typing import Optional,Union,List,Dict --import os --import json -- --@dataclass --class InferenceConfig: -- tokenizer: str = "" -- hf_model_dir:str = "" # huggingface 模型目录,包含tokenizer和config.json -- sampling_method: str = "top_k" # "greedy" | "top_k" | "top_p" -- sampling_value: float = 10 # k for top_k p for top_p -- temperature: float = 0.7 -- max_length:int = 512 # 输出长度的最大值 -- max_input_len:int = 1 # 每次推理输入的最大长度为max_input_len,对om目前设置为1 -- session_type:str="acl" # onnx或者acl -- acl_mode="rc" #rc模式下host和device是同一块内存,可以对执行流程进行优化 -- device:int=0 -- # prompt:List[Dict[str,str]] = field(default_factory=list) -- prompt:List[Dict[str,str]] = field(default_factory=lambda: [ -- {"role":"user","content":"Hey there I am a human that would like to have a conversation with you."}, -- {"role":"assistant","content":"Sure, I am happy to answer your questions"}, -- {"role":"user","content":"Great, I insist that we take turns."}, -- {"role":"assistant","content":"I agree, we should take turns."}, -- ]) -- model:str="" -- kvcache_method:str = "sliding-window" # "basic"|"sliding-window"|'streamllm'|'H2O' -- kvcache_fixsize:bool = True # 输入的kv缓存是否固定shape -- head_len:int= 32 # 在KVCache evict时前head len会被保留 -- recent_len:int = 32 # 在KVCache evict时最近recent len会被保留 -- evict_len:int = 64 # KVCache 逐出的最小值,当KVCache达到最大值时将逐出evict_len个KVCache +--- ascend-llm/inference/config.py 2024-09-05 15:10:55.833305200 +0800 ++++ ascend-llm-qwen/inference/config.py 2024-09-05 15:20:36.210316200 +0800 +@@ -28,9 +28,9 @@ + head_len:int= 32 # 在KVCache evict时前head len会被保留 + recent_len:int = 32 # 在KVCache evict时最近recent len会被保留 + evict_len:int = 64 # KVCache 逐出的最小值,当KVCache达到最大值时将逐出evict_len个KVCache - n_layer:int = 22 -- format:str='huggingface-tensor' #KVcache的格式 ++ n_layer:int = 28 + format:str='huggingface-tensor' #KVcache的格式 - max_cache_size:int=256 # kvcache的最大长度 -- head_num:int=4 -- num_kv_group:int = 8 # for GQA -- head_dim:int=64 -- hidden_dim:int=2048 -- dtype:str="float16" -- model_type:str="llama-2-7b" -- -- def __post_init__(self): -- assert(self.kvcache_method in ["basic","sliding-window",'streamllm','H2O']) -- assert(os.path.isdir(self.hf_model_dir)) -- assert(self.session_type in ["acl","onnx"]) -- if self.session_type == "onnx": -- self.max_input_len = self.max_length -- self.evict_len = int(min((self.max_cache_size - self.head_len )/2,self.evict_len )) -- self.max_input_len = int(min(self.max_input_len,self.evict_len)) -- self.tokenizer = self.hf_model_dir -- model_desc = None -- with open(self.hf_model_dir+"/config.json") as f: -- model_desc = json.load(f) -- self.n_layer = model_desc['num_hidden_layers'] -- self.head_num = model_desc['num_key_value_heads'] -- self.num_kv_group = int(model_desc['num_attention_heads'] / self.head_num) -- self.hidden_dim = model_desc["hidden_size"] -- self.head_dim = int(self.hidden_dim / model_desc['num_attention_heads']) -- if self.hidden_dim == 2048: -- self.model_type = "tiny-llama" -- if self.kvcache_method == "streamllm": -- assert(self.head_len+self.evict_len < self.max_cache_size) -- if self.kvcache_method == "H2O": -- self.evict_len = int(min((self.max_cache_size - self.head_len -self.recent_len )/2,self.evict_len )) -- assert(self.head_len+self.recent_len+self.evict_len < self.max_cache_size) -diff -uNr ascend-llm/inference/engine.py ascend-llm-qwen/inference/engine.py ---- ascend-llm/inference/engine.py 2024-09-04 19:21:03.079083400 +0800 
-+++ ascend-llm-qwen/inference/engine.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,242 +0,0 @@ --import time --from typing import Dict, List --import acl --import numpy as np --import os --import ctypes --from ctypes import c_void_p, c_int, c_size_t, c_ulong, c_int64,POINTER --ACL_MEM_MALLOC_HUGE_FIRST = 0 --ACL_MEMCPY_HOST_TO_DEVICE = 1 --ACL_MEMCPY_DEVICE_TO_HOST = 2 --ACL_MEM_MALLOC_NORMAL_ONLY = 2 -- --libc = ctypes.CDLL("libc.so.6") --# mmap函数原型 --mmap_func = libc.mmap --mmap_func.argtypes = [c_void_p, c_size_t, c_int, c_int, c_int, c_int64] --mmap_func.restype = c_void_p -- --# munmap函数原型 --munmap_func = libc.munmap --munmap_func.argtypes = [c_void_p, c_size_t] --munmap_func.restype = c_int --def mmap_file(file_path): -- # 打开文件并获取文件描述符 -- file_descriptor = os.open(file_path, os.O_RDONLY) -- file_size = os.lseek(file_descriptor, 0, os.SEEK_END) -- os.lseek(file_descriptor, 0, os.SEEK_SET) -- # 调用mmap映射文件到内存 -- # PROT_READ和MAP_PRIVATE的值可能因系统而异,这里假设为1和2 -- protection_flags = 1 # PROT_READ -- visibility_flags = 2 # MAP_PRIVATE -- mapped_memory = mmap_func(None, file_size, protection_flags, visibility_flags, file_descriptor, 0) -- if mapped_memory == -1: -- raise Exception("Error mapping the file.") -- -- # 关闭文件描述符,映射区域仍然有效 -- os.close(file_descriptor) -- -- # 返回映射区域的地址 -- return mapped_memory,file_size --def check_ret(str,ret): -- if ret != 0: -- print(f"return code is {ret}, detail: {str}",flush=True) -- --def initResource(device): -- ret = acl.init() -- check_ret("init", ret) -- ret = acl.rt.set_device(device) -- check_ret("set_device", ret) -- context,ret = acl.rt.create_context(device) -- check_ret("create_context", ret) -- return context -- --def destroyResource(device,context): -- ret = acl.rt.reset_device(device) -- ret = acl.finalize() -- ret = acl.rt.destroy_context(context) -- --ACL_FLOAT,ACL_FLOAT16,ACL_INT8,ACL_INT32,ACL_INT64 = 0,1,2,3,9 --NPY_FLOAT32,NPY_FLOAT16,NPY_INT8,NPY_INT32,NPY_INT64 = 11,23,1,5,7 --dtype2NpType = {ACL_FLOAT:np.float32,ACL_FLOAT16:np.float16,ACL_INT8:np.int8,ACL_INT32:np.int32,ACL_INT64:np.int64} --dtypeMp = {ACL_FLOAT:NPY_FLOAT32,ACL_FLOAT16:NPY_FLOAT16,ACL_INT8:NPY_INT8,ACL_INT32:NPY_INT32,ACL_INT64:NPY_INT64} --class ACLModel: -- def __init__(self,model_path,mode="rc",context=None,callback=None): -- self.context = context -- self.model_id = None -- self.model_desc = None -- self.callback_func = callback -- self.tid = None -- self.stream = None -- self.callback_interval = 1 -- self.exit_flag = False -- self.mode = mode -- self.input_dataset, self.output_dataset = None, None -- self.inputs:List[Dict[str,]] = [] -- self.outputs:List[Dict[str,]] = [] -- self.in_arrs:List[np.ndarray] = [] -- self.out_arrs:List[np.ndarray] = [] -- self.loadModel(model_path) -- self.allocateMem() -- if not callback: -- return -- self.stream, ret = acl.rt.create_stream() -- self.tid, ret = acl.util.start_thread(self._process_callback, -- [self.context, 50]) -- check_ret("acl.util.start_thread", ret) -- ret = acl.rt.subscribe_report(self.tid, self.stream) -- check_ret("acl.rt.subscribe_report", ret) -- -- def unload(self): -- if self.callback_func: -- ret = acl.rt.synchronize_stream(self.stream) -- # 2.7 取消线程注册,Stream上的回调函数不再由指定线程处理。 -- ret = acl.rt.unsubscribe_report(self.tid, self.stream) -- self.exit_flag = True -- ret = acl.util.stop_thread(self.tid) -- ret = acl.rt.destroy_stream(self.stream) -- self.freeMem() -- self.unloadModel() -- -- def loadModel(self, model_path): -- ''' -- model_size = os.path.getsize(model_path) -- -- work_size, weight_size, ret = 
acl.mdl.query_size(model_path) -- weight_size = max(model_size,weight_size) -- work_ptr, ret= acl.rt.malloc_host(work_size) -- model = acl.rt.malloc_host(weight_size) -- with open(model_path, 'rb') as file: -- model = file.read() -- self.model_id, ret = acl.mdl.load_from_mem_with_mem(id(model), weight_size, work_ptr, work_size, id(model), weight_size) -- ''' -- model_add, model_size = mmap_file(model_path) -- self.model_id, ret = acl.mdl.load_from_mem(model_add, model_size) -- -- #self.model_id, ret = acl.mdl.load_from_file(model_path) -- check_ret("load model",ret) -- munmap_func(model_add, model_size) -- self.model_desc = acl.mdl.create_desc() -- ret = acl.mdl.get_desc(self.model_desc, self.model_id) -- check_ret("get model desc",ret) -- -- def unloadModel(self): -- ret = acl.mdl.unload(self.model_id) -- if self.model_desc: -- ret = acl.mdl.destroy_desc(self.model_desc) -- self.model_desc = None -- -- def allocateMem(self): -- self.input_dataset = acl.mdl.create_dataset() -- input_size = acl.mdl.get_num_inputs(self.model_desc) -- for i in range(input_size): -- buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i) -- buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) -- check_ret("alloc input memory",ret) -- data = acl.create_data_buffer(buffer, buffer_size) -- _, ret = acl.mdl.add_dataset_buffer(self.input_dataset, data) -- check_ret("add_dataset_buffer",ret) -- dims, ret = acl.mdl.get_input_dims(self.model_desc, i) -- self.inputs.append({"buffer": buffer, "size": buffer_size}) -- if self.mode == 'rc': -- data_type = acl.mdl.get_input_data_type(self.model_desc, i) -- self.in_arrs.append(acl.util.ptr_to_numpy(buffer,tuple(dims['dims']),dtypeMp[data_type])) -- -- self.output_dataset = acl.mdl.create_dataset() -- output_size = acl.mdl.get_num_outputs(self.model_desc) -- buffer_host = None -- for i in range(output_size): -- buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i) -- data_type = acl.mdl.get_output_data_type(self.model_desc, i) -- buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) -- check_ret("alloc output memory",ret) -- data = acl.create_data_buffer(buffer, buffer_size) -- _, ret = acl.mdl.add_dataset_buffer(self.output_dataset, data) -- check_ret("add_dataset_buffer",ret) -- dims, ret = acl.mdl.get_output_dims(self.model_desc, i) -- if self.mode == 'rc': -- self.out_arrs.append(acl.util.ptr_to_numpy(buffer,tuple(dims['dims']),dtypeMp[data_type])) -- else: -- buffer_host, ret = acl.rt.malloc_host(buffer_size) -- check_ret("alloc output host memory",ret) -- self.outputs.append({"buffer": buffer, "size": buffer_size,'buffer_host':buffer_host,'dtype':dtype2NpType[data_type]}) -- -- def freeMem(self): -- for item in self.input_data: -- ret = acl.rt.free(item["buffer"]) -- ret = acl.mdl.destroy_dataset(self.input_dataset) -- for item in self.output_data: -- ret = acl.rt.free(item["buffer"]) -- if self.mode != 'rc': -- ret = acl.rt.free_host(item["buffer_host"]) -- ret = acl.mdl.destroy_dataset(self.output_dataset) -- -- def getInputs(self): -- return self.in_arrs # 获取输入np数组,可以直接修改 -- -- def inference(self,datas) -> List[np.ndarray]: -- acl.rt.set_context(self.context) -- if self.mode == 'rc': -- for i,data in enumerate(datas): -- self.in_arrs[i][:] = data[:] # 如果输入的np数组和in_arrs中是一个数组则不会发生拷贝 -- else: -- for i,data in enumerate(datas): -- bytes_data = data.tobytes() -- np_ptr = acl.util.bytes_to_ptr(bytes_data) -- ret = acl.rt.memcpy(self.inputs[i]["buffer"], self.inputs[i]["size"], np_ptr,self.inputs[i]["size"], 
ACL_MEMCPY_HOST_TO_DEVICE) -- check_ret("memcpy", ret) -- ret = acl.mdl.execute(self.model_id, self.input_dataset,self.output_dataset) -- check_ret("execute", ret) -- if self.mode == 'rc': -- return self.out_arrs -- inference_result = [] -- for idx,out in enumerate(self.outputs): -- ret = acl.rt.memcpy(out['buffer_host'], out["size"],out["buffer"],out["size"],ACL_MEMCPY_DEVICE_TO_HOST) -- bytes_out = acl.util.ptr_to_bytes(out['buffer_host'], out["size"]) -- dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, idx) -- out_data = np.frombuffer(bytes_out, dtype=out['dtype']).reshape(dims['dims']) -- inference_result.append(out_data) -- return inference_result -- -- def inference_async(self,datas,other_args) -> List[np.ndarray]: -- acl.rt.set_context(self.context) -- if self.mode == 'rc': -- for i,data in enumerate(datas): -- self.in_arrs[i][:] = data[:] -- else: -- for i,data in enumerate(datas): -- np_ptr = acl.util.bytes_to_ptr(data.tobytes()) -- ret = acl.rt.memcpy(self.inputs[i]["buffer"], self.inputs[i]["size"], np_ptr,self.inputs[i]["size"], ACL_MEMCPY_HOST_TO_DEVICE) -- check_ret("memcpy", ret) -- ret = acl.mdl.execute_async(self.model_id, self.input_dataset,self.output_dataset,self.stream) -- check_ret("exec_async", ret) -- print(f"submit exec task {other_args[1]}") -- ret = acl.rt.launch_callback(self.callPostProcess,other_args,1,self.stream) -- check_ret("launch callback", ret) -- -- def _process_callback(self, args_list): -- context, timeout = args_list -- acl.rt.set_context(context) -- while self.callback_interval: -- acl.rt.process_report(timeout) -- if self.exit_flag: -- print("[Callback] exit acl.rt.process_report") -- break -- -- def callPostProcess(self,other_args): -- print("start callback",flush=True) -- time1 = time.time() -- inference_result = [] -- if self.mode == 'rc': -- inference_result = self.out_arrs -- else: -- for idx,out in enumerate(self.outputs): -- ret = acl.rt.memcpy(out['buffer_host'], out["size"],out["buffer"],out["size"],ACL_MEMCPY_DEVICE_TO_HOST) -- bytes_out = acl.util.ptr_to_bytes(out['buffer_host'], out["size"]) -- dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, idx) -- data = np.frombuffer(bytes_out, dtype=out['dtype']).reshape(dims['dims']) -- inference_result.append(data) -- if not self.callback_func: -- return -- self.callback_func(inference_result,other_args) -- print(f"end callback, use time: {time.time()-time1}") -\ No newline at end of file -diff -uNr ascend-llm/inference/inference.py ascend-llm-qwen/inference/inference.py ---- ascend-llm/inference/inference.py 2024-09-04 19:21:03.079083400 +0800 -+++ ascend-llm-qwen/inference/inference.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,154 +0,0 @@ --import numpy as np --import os --from typing import Any, Generator, List,Tuple,Dict --from threading import Lock --from session import Session --from config import InferenceConfig -- --class LlamaInterface: -- def __init__(self,config:InferenceConfig) -> None: -- self.max_length = config.max_length -- from transformers import AutoTokenizer -- self.tokenizer=AutoTokenizer.from_pretrained(config.tokenizer) -- self.sampling_method=config.sampling_method -- self.sampling_value = config.sampling_value -- self.temperature=config.temperature -- self.session=Session.fromConfig(config) -- self.prompt=config.prompt -- self.state:dict[str,Any] = {"code":200,"isEnd":False,"message":""} -- self.first=True -- self.stop_mp = {"[|Human|]":6,"[|AI|]":5,"<|assistant|>":6,"<|user|>":5,"<|system|>":5} -- self.stop_words = 
["<|user|>","<|assistant|>","<|system|>","[|AI|]","[|Human|]"] -- self.model_type = config.model_type -- self.last_output="" -- self.lock = Lock() -- self.reset() -- print("init success") -- -- def generate_cache(self,prompt:str): -- if len(prompt) == 0 : -- return -- input_ids = np.asarray(self.encode(prompt,add_bos_token=self.first),dtype=np.int64).reshape(1,-1) -- self.first=False -- logits = self.session.run(input_ids)[0] -- return self.sample_logits(logits[0][-1:],self.sampling_method,self.sampling_value,self.temperature),logits -- -- def sample_logits( -- self, -- logits: np.ndarray, -- sampling_method: str = "greedy", -- sampling_value: float = None, -- temperature: float = 1.0, -- ) -> np.ndarray: -- if temperature == 0 or sampling_method == "greedy": -- next_token = np.argmax(logits, axis=-1).astype(np.int64) -- -- elif sampling_method == "top_k" or sampling_method == "top_p": -- assert sampling_value is not None -- logits = logits.astype(np.float32) -- logits /= temperature -- probs = np.exp(logits) / np.sum(np.exp(logits)) -- sorted_probs = np.sort(probs)[:, ::-1] -- sorted_indices = np.argsort(probs)[:, ::-1] -- -- if sampling_method == "top_k": -- index_of_interest = int(sampling_value) -- elif sampling_method == "top_p": -- p = sampling_value -- cumulative_probs = np.cumsum(sorted_probs, axis=-1) -- for index_of_interest, cumulative_prob in enumerate( -- cumulative_probs[0] -- ): -- if cumulative_prob > p: -- break -- -- probs_of_interest = sorted_probs[:, : index_of_interest + 1] -- indices_of_interest = sorted_indices[:, : index_of_interest + 1] -- probs_of_interest /= np.sum(probs_of_interest) -- next_token = np.array( -- [np.random.choice(indices_of_interest[0], p=probs_of_interest[0])] -- ) -- else: -- raise Exception(f"Unknown sampling method {sampling_method}") -- -- return next_token -- -- -- def format_last_output(self): -- if len(self.last_output) == 0: -- return -- text_format = self.apply_chat_template([{"role":"assistant","content":self.last_output}]) -- self.generate_cache(text_format[len(self.last_output):]) -- self.last_output = "" -- -- def predict(self, text): -- with self.lock: -- self.state['isEnd'],self.state['message'] = False,"" -- if text == "": -- return -- self.format_last_output() -- text = self.apply_chat_template([{"role":"user","content":text}]) -- input_ids = self.encode(text,add_bos_token=self.first) -- input_ids = np.asarray(input_ids,dtype=np.int64).reshape(1,-1) -- self.first,ids_list = False,[] -- for i in range(self.max_length): -- logits = self.session.run(input_ids)[0] -- input_ids = self.sample_logits(logits[0][-1:], self.sampling_method, self.sampling_value, self.temperature) -- input_ids = input_ids.reshape(1, -1) -- if input_ids[0] == self.tokenizer.eos_token_id: -- self.session.rollback(1) -- break -- ids_list.append(input_ids[0].item()) -- text_out = self.tokenizer.decode(ids_list) -- stop_word = is_stop_word_or_prefix(text_out,self.stop_words) -- if stop_word != "": -- ids_list = ids_list[:-self.stop_mp[stop_word]] -- self.session.rollback(self.stop_mp[stop_word]) -- break -- if i%3 == 0: -- with self.lock: -- self.state['message']=text_out -- self.last_output = self.tokenizer.decode(ids_list) -- with self.lock: -- self.state['message'],self.state['isEnd'] = self.last_output,True -- return self.last_output -- -- def reset(self): -- self.first = True -- self.last_output = "" -- self.session.reset() -- self.generate_cache(self.apply_chat_template(self.prompt)) -- -- def getState(self): -- with self.lock: -- return self.state.copy() 
-- -- def apply_chat_template(self,messages:List[Dict[str,str]]) -> str: -- text = "" -- if self.model_type == "llama-2-7b": -- for message in messages: -- if message["role"] == "user": -- text += f'[|Human|]\n{message["content"]}\n[|AI|]' -- elif message["role"] == "system": -- text += f'[|System|]\n{message["content"]}\n' -- else: -- text += f'{message["content"]}\n' -- elif self.model_type == "tiny-llama": -- for message in messages: -- if message["role"] == "user": -- text += f'<|user|>\n{message["content"]}\n<|assistant|>' -- elif message["role"] == "system": -- text += f'<|system|>\n{message["content"]}\n' -- else: -- text += f'{message["content"]}\n' -- return text -- -- def encode(self,text,add_bos_token=False): -- self.tokenizer.add_bos_token = add_bos_token -- return self.tokenizer.encode(text) -- --def is_stop_word_or_prefix(s: str, stop_words: list) -> int: -- for stop_word in stop_words: -- if s.endswith(stop_word): -- return stop_word -- return "" -\ No newline at end of file -diff -uNr ascend-llm/inference/kvcache.py ascend-llm-qwen/inference/kvcache.py ---- ascend-llm/inference/kvcache.py 2024-09-04 19:21:03.079083400 +0800 -+++ ascend-llm-qwen/inference/kvcache.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,199 +0,0 @@ --import numpy as np --from typing import Optional,Tuple,List --from config import InferenceConfig --# 对KV缓存和输出输出格式进行管理 --class KVCache: -- def __init__(self,cfg:InferenceConfig) -> None: -- self.head_len = cfg.head_len -- self.max_size = cfg.max_cache_size -- self.input_pos = 0 -- self.kv_size = 0 -- self.n_layer = cfg.n_layer -- self.kvCache = None -- self.format=cfg.format -- self.head_num=cfg.head_num -- self.head_dim=cfg.head_dim -- self.dtype=np.float16 -- self.fix_size = cfg.kvcache_fixsize -- self.evict_len = cfg.evict_len -- self.recent_len = cfg.recent_len -- self.num_kv_group = cfg.num_kv_group -- if cfg.dtype == "float16": -- self.dtype=np.float16 -- elif cfg.dtype=="float32": -- self.dtype=np.float32 -- self.createEmptyCache() -- -- def createEmptyCache(self): -- if self.format == "huggingface-tensor": -- self.kvCache=np.zeros((self.n_layer,2,1,self.head_num,self.max_size,self.head_dim),dtype=self.dtype) -- elif self.format == "huggingface-list": -- self.kvCache=[] -- for i in range(self.n_layer): -- self.kvCache.append([np.zeros((1,self.head_num,self.max_size,self.head_dim),dtype=self.dtype),np.zeros((1,self.head_num,self.max_size,self.head_dim),dtype=self.dtype)]) -- elif self.format == 'seq_nhead_headdim': -- self.kvCache = [np.zeros((1,self.n_layer,self.max_size,self.head_num,self.head_dim),dtype=self.dtype),np.zeros((1,self.n_layer,self.max_size,self.head_num,self.head_dim),dtype=self.dtype)] -- elif self.format == 'nhead_seq_headdim': -- self.kvCache = [np.zeros((1,self.n_layer,self.head_num,self.max_size,self.head_dim),dtype=self.dtype),np.zeros((1,self.n_layer,self.head_num,self.max_size,self.head_dim),dtype=self.dtype)] -- -- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],scores:Optional[np.ndarray]=None)->None: -- pass -- -- def evict(self,space_need:int): -- pass -- -- def getInputs(self, seq_len: int) -> List[np.ndarray]: -- cache,mask = None,None -- if self.fix_size: -- cache,mask = self.kvCache, np.ones((1,self.max_size+seq_len),dtype=np.int64) -- mask[:,self.kv_size:self.max_size] = 0 -- else: -- cache,mask = self.kvCache[:,:,:,:,:self.kv_size], np.ones((1,self.kv_size+seq_len),dtype=np.int64) -- pos_id =np.arange(self.input_pos,self.input_pos+seq_len,dtype=np.int64).reshape(1,-1) -- return 
cache,mask,pos_id -- -- def reset(self): -- self.input_pos=0 -- self.kv_size=0 -- -- def rollback(self,seq_len): -- self.kv_size -=seq_len -- -- @staticmethod -- def create(config:InferenceConfig) -> 'KVCache': -- if config.kvcache_method == "basic": -- return Basic(config) -- elif config.kvcache_method == "sliding-window": -- return SWindow(config) -- elif config.kvcache_method == 'streamllm': -- return StreamLLM(config) -- elif config.kvcache_method == 'H2O': -- return H2O(config) -- else: -- return None -- --class Basic(KVCache): -- def __init__(self, cfg: InferenceConfig) -> None: -- super().__init__(cfg) -- -- def update(self, seq_len: int, newKV: Tuple[List[np.ndarray]], scores: Optional[np.ndarray] = None) -> None: -- if seq_len + self.kv_size > self.max_size: -- raise RuntimeError("超出KV缓存长度限制") -- if self.format=="huggingface-tensor": -- self.kvCache[:,:,:,:,self.kv_size:self.kv_size+seq_len,:] = newKV[:,:,:,:,0:seq_len,:] -- self.kv_size += seq_len -- self.input_pos+=seq_len -- --class SWindow(KVCache): -- def __init__(self,cfg:InferenceConfig) -> None: -- super().__init__(cfg) -- self.p=0 -- self.cnt = 0 -- -- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],score:Optional[np.ndarray] = None): -- self.input_pos+=seq_len -- cur = 0 -- while self.p + seq_len > self.max_size: -- self.update_part(newKV,cur,self.max_size-self.p) -- cur += (self.max_size-self.p) -- seq_len -= (self.max_size-self.p) -- self.p = self.head_len -- self.kv_size = self.max_size -- self.cnt += 1 -- self.update_part(newKV,cur,seq_len) -- self.p += seq_len -- self.kv_size = max(self.p,self.kv_size) -- -- def update_part(self,newKV:Tuple[List[np.ndarray],List[np.ndarray]],begin:int,len:int): -- if len == 0: -- return -- if self.format == 'huggingface-tensor': #[n_layer,2,batch_size,head_num,len,head_dim] -- self.kvCache[:,:,:,:,self.p:self.p+len,:] = newKV[:,:,:,:,begin:begin+len,:] -- if self.format=='seq_nhead_headdim': # [batch, n_layers, seq_len, n_heads, head_dim] -- self.kvCache[0][:,:,self.p:self.p+len] = newKV[0][:,:,begin:begin+len] -- self.kvCache[1][:,:,self.p:self.p+len] = newKV[1][:,:,begin:begin+len] -- elif self.format=='nhead_seq_headdim': # [batch, n_layers, n_heads, seq_len, head_dim] -- self.kvCache[0][:,:,:,self.p:self.p+len] = newKV[0][:,:,:,begin:begin+len] -- self.kvCache[1][:,:,:,self.p:self.p+len] = newKV[1][:,:,:,begin:begin+len] -- elif self.format=='huggingface-list': # (n_layer,2) * [batch_size,head_num,len,head_dim] -- for i in range(self.n_layer): -- self.kvCache[i][0][:,:,self.p:self.p+len,:] = newKV[i][0][:,:,begin:begin+len,:] -- self.kvCache[i][1][:,:,self.p:self.p+len,:] = newKV[i][1][:,:,begin:begin+len,:] -- -- def reset(self): -- self.p=0 -- return super().reset() -- -- def rollback(self, seq_len): -- if self.cnt != 0: -- self.p -= seq_len -- self.cnt -= 1 -- if self.p < self.head_len: -- self.p = self.max_size - (self.head_len - self.p) + 1 -- if self.cnt == 0: -- self.kv_size = self.p -- else: -- self.p -= seq_len -- self.kv_size -= seq_len -- --class StreamLLM(KVCache): -- def __init__(self,cfg:InferenceConfig): -- super().__init__(cfg) -- -- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],score:Optional[np.ndarray] = None): -- if self.kv_size + seq_len >= self.max_size: -- self.evict(self.evict_len) -- self.input_pos += seq_len -- self.kvCache[:,:,:,:,self.kv_size:self.kv_size+seq_len] = newKV -- self.kv_size += seq_len -- -- def evict(self, space_need: int): -- self.kvCache[:,:,:,:,self.head_len:self.kv_size 
-space_need] = \ -- self.kvCache[:,:,:,:,self.head_len+space_need:self.kv_size] -- self.kv_size -= space_need -- --class H2O(KVCache): -- def __init__(self,cfg:InferenceConfig) -> None: -- super().__init__(cfg) -- self.scores = np.zeros((self.n_layer,1,self.head_num,self.max_size),dtype=self.dtype) -- self.idx_head = np.arange(0,self.head_num,dtype=np.int32).reshape(-1,1) -- -- def update(self,seq_len:int,newKV:Tuple[List[np.ndarray],List[np.ndarray]],score:Optional[np.ndarray] = None): -- # score [n_layer,batch,nheader,input_len,all_len] -- if self.num_kv_group != 1: -- score = score.reshape(self.n_layer,1,self.num_kv_group,self.head_num,seq_len,-1).sum(axis=2) -- elif not score.flags.writeable: -- score = score.copy() # acl 返回的ndarray不可写 -- score[:,:,:,:,self.kv_size:self.kv_size+seq_len] = score[:,:,:,:,-seq_len:] -- if self.kv_size + seq_len > self.max_size: -- self.o_score = score -- self.evict(self.evict_len) -- self.o_score,score = None,self.o_score -- self.input_pos += seq_len -- self.kvCache[:,:,:,:,self.kv_size:self.kv_size+seq_len] = newKV -- for i in range(seq_len): -- self.update_score_one(score[:,:,:,i]) -- -- def update_score_one(self,score:Optional[np.ndarray] = None): -- self.kv_size += 1 -- self.scores[:,:,:,:self.kv_size] = self.scores[:,:,:,:self.kv_size] * 0.5 + score[:,:,:,:self.kv_size] -- -- def evict(self, space_need): -- r_len = self.kv_size - space_need - self.head_len -self.recent_len # 对前head len个KV缓存进行保留 -- new_seq = self.o_score.shape[-2] -- for i in range(self.n_layer): -- idx=np.argpartition(-self.scores[i,0,:,self.head_len:self.kv_size-self.recent_len],r_len,axis=-1)[:,:r_len] -- for j in range(2): -- self.kvCache[i,j,0,:,self.head_len:self.head_len+r_len] = self.kvCache[i,j,0,self.idx_head,idx] -- self.kvCache[i,j,0,:,self.head_len+r_len:self.kv_size-space_need] = \ -- self.kvCache[i,j,0,:,self.kv_size-self.recent_len:self.kv_size] -- self.scores[i,0,:,self.head_len:r_len+self.head_len] = self.scores[i,0,self.idx_head,idx] -- self.scores[i,0,:,self.head_len+r_len:self.kv_size-space_need] = \ -- self.scores[i,0,:,self.kv_size-self.recent_len:self.kv_size] -- for j in range(new_seq): -- self.o_score[i,0,:,j,self.head_len:r_len+self.head_len] = self.o_score[i,0,self.idx_head,j,idx] -- self.o_score[i,0,:,j,self.head_len+r_len:self.kv_size+new_seq-space_need] = \ -- self.o_score[i,0,:,j,self.kv_size-self.recent_len:self.kv_size+new_seq] -- self.scores[i,0,:,r_len+self.head_len+self.recent_len:] = 0 -- self.kv_size = r_len + self.head_len + self.recent_len -- # self.head_len + r_len + self.recent_len + new_seq -diff -uNr ascend-llm/inference/main.py ascend-llm-qwen/inference/main.py ---- ascend-llm/inference/main.py 2024-09-04 19:21:03.080078900 +0800 -+++ ascend-llm-qwen/inference/main.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,94 +0,0 @@ --import argparse --import sys --from concurrent.futures import ThreadPoolExecutor --from config import InferenceConfig --from inference import LlamaInterface -- --def main(cli:bool,engine:LlamaInterface): -- if cli: -- while True: -- line = input() -- print(engine.predict(line)) -- from flask import Flask, request, jsonify -- from flask import render_template # 引入模板插件 -- from flask_cors import CORS -- pool = ThreadPoolExecutor(max_workers=2) -- app = Flask( -- __name__, -- static_folder='./dist', # 设置静态文件夹目录 -- template_folder="./dist", -- static_url_path="" -- ) -- -- CORS(app, resources=r'/*') -- -- @app.route('/') -- def index(): -- return render_template('index.html', name='index') -- -- @app.route("/api/chat", 
methods=["POST"]) -- def getChat(): -- msg = request.get_json(force=True)['message'] -- if len(msg) == 0: -- return jsonify({"code": 404}) -- pool.submit(engine.predict,msg) -- return jsonify({"code": 200}) -- -- @app.route("/api/getMsg", methods=["GET"]) -- def getMsg(): -- return jsonify(engine.getState()) -- -- @app.route("/api/reset", methods=["GET"]) -- def reset(): -- engine.reset() -- return jsonify({"code": 200}) -- -- app.run( -- use_reloader=False, -- host="0.0.0.0", -- port=5000 -- ) -- --if __name__ == '__main__': -- parser = argparse.ArgumentParser() -- parser.add_argument( -- '--cli', dest='cli', default=False, action='store_true', -- help="run web ui by default, if add --cli, run cli." -- ) -- parser.add_argument("--kv_size", type=int, default=256) -- parser.add_argument( -- "--engine", type=str, default="acl", -- help="inference backend, onnx or acl" -- ) -- parser.add_argument( -- "--sampling", type=str, default="top_k", -- help="sampling method, greedy, top_k or top_p" -- ) -- parser.add_argument( -- "--sampling_value",type=float,default=10, -- help="if sampling method is seted to greedy, this argument will be ignored; if top_k, it means value of p; if top_p, it means value of p" -- ) -- parser.add_argument( -- "--temperature",type=float,default=0.7, -- help="sampling temperature if sampling method is seted to greedy, this argument will be ignored." -- ) -- parser.add_argument( -- "--hf-dir", type=str, default="/root/model/tiny-llama-1.1B", -- help="path to huggingface model dir" -- ) -- parser.add_argument( -- "--model", type=str, default="/root/model/tiny-llama-seq-1-key-256-int8.om", -- help="path to onnx or om model" -- ) -- args = parser.parse_args() -- cfg = InferenceConfig( -- hf_model_dir=args.hf_dir, -- model=args.model, -- max_cache_size=args.kv_size, -- sampling_method=args.sampling, -- sampling_value=args.sampling_value, -- temperature=args.temperature, -- session_type=args.engine, -- ) -- engine = LlamaInterface(cfg) -- main(args.cli,engine) -\ No newline at end of file -diff -uNr ascend-llm/inference/readme.md ascend-llm-qwen/inference/readme.md ---- ascend-llm/inference/readme.md 2024-09-04 19:21:03.080078900 +0800 -+++ ascend-llm-qwen/inference/readme.md 1970-01-01 08:00:00.000000000 +0800 -@@ -1,16 +0,0 @@ --# inference -- --目前提供两种运行模式: --1. cli模式:在终端运行,每一次输入一行,一次性返回所有的推理结果。 --2. 
web模式:前端代码在[github](https://github.com/yinghuo302/ascend-llm-web)或者[gitee](https://gitee.com/yinghuo302/ascend-llm-web),打包出dist文件夹,放在inference文件夹下即可。 -- --```bash --cd inference --python main.py \ -- --model \ -- --hf-dir \ # 需要tokenizer和模型配置文件,权重不需要 -- --engine -- --sampling --sampling_value <> --temperature <> # 采样相关配置 -- --cli # 添加--cli表示在终端运行 --``` --代码需要修改的部分主要在与config.py,可以根据注释修改。inference.py中关于输入格式和结束语判断的部分可能也需要根据具体的模型修改。 -\ No newline at end of file -diff -uNr ascend-llm/inference/requirements.txt ascend-llm-qwen/inference/requirements.txt ---- ascend-llm/inference/requirements.txt 2024-09-04 19:21:03.080078900 +0800 -+++ ascend-llm-qwen/inference/requirements.txt 1970-01-01 08:00:00.000000000 +0800 -@@ -1,6 +0,0 @@ --# onnxruntime or acl --onnxruntime --numpy --transformers --flask --flask_cors -\ No newline at end of file -diff -uNr ascend-llm/inference/session.py ascend-llm-qwen/inference/session.py ---- ascend-llm/inference/session.py 2024-09-04 19:21:03.080078900 +0800 -+++ ascend-llm-qwen/inference/session.py 1970-01-01 08:00:00.000000000 +0800 -@@ -1,87 +0,0 @@ --from config import InferenceConfig --from kvcache import KVCache --import numpy as np --from typing import List --import time --import sys --class Session: -- def __init__(self,config:InferenceConfig) -> None: -- self.kvCache = KVCache.create(config) -- self.max_len = config.max_input_len -- -- def run(self,input_ids:np.ndarray): -- pass -- -- @staticmethod -- def fromConfig(config:InferenceConfig) -> 'Session': -- if config.session_type == "onnx": -- return OnnxSession(config) -- elif config.session_type=='acl': -- return AclSession(config) -- else: -- return None -- -- def reset(self): -- self.kvCache.reset() -- -- def rollback(self,seq_len): -- self.kvCache.rollback(seq_len) -- -- def evict(self,space_need): -- self.kvCache.evict(space_need) -- --class OnnxSession(Session): -- def __init__(self,config:InferenceConfig)->None: -- super().__init__(config) -- import onnxruntime -- options = onnxruntime.SessionOptions() -- self.llm_session = onnxruntime.InferenceSession( -- config.model, -- sess_options=options, -- providers=[ -- "DmlExecutionProvider", -- "CUDAExecutionProvider", -- "CPUExecutionProvider", -- ], -- ) -- -- def run(self,input_ids:np.ndarray): -- seq_len=input_ids.shape[-1] -- l,r,result = 0,self.max_len,None -- while l < seq_len: -- r = min(seq_len,r) -- cache,mask,pos_ids = self.kvCache.getInputs(r-l) -- result = self.llm_session.run(None,{ -- "input_ids": input_ids[:,l:r], -- "attention_mask":mask, -- "past_key_values": cache, -- "position_ids": pos_ids, -- }) -- # result: [logits,key_values,attn_scores] -- self.kvCache.update(r-l,result[1],result[2]) -- l , r = l+self.max_len , r + self.max_len -- return result -- --class AclSession(Session): -- context = None -- def __init__(self,config:InferenceConfig)->None: -- super().__init__(config) -- from engine import ACLModel,initResource -- self.context = initResource(config.device) -- self.model = ACLModel(config.model,context=self.context,mode=config.acl_mode) -- self.input_ids = np.zeros((1,self.max_len),dtype=np.int64) -- if config.acl_mode == 'rc': -- self.input_ids,_,_,self.kvCache.kvCache = self.model.getInputs() -- -- def run(self,input_ids:np.ndarray): -- seq_len=input_ids.shape[-1] -- l,r,result = 0,self.max_len,None -- while l < seq_len: -- r = min(seq_len,r) -- self.input_ids[:,:r-l] = input_ids[:,l:r] -- cache,mask,pos_ids = self.kvCache.getInputs(self.max_len) -- result:List[np.ndarray] = 
self.model.inference([self.input_ids,mask,pos_ids,cache]) -- # result: [logits,key_values,attn_scores] -- self.kvCache.update(r-l,result[1],result[2]) -- l , r = l+self.max_len , r + self.max_len -- return result -\ No newline at end of file -diff -uNr ascend-llm/readme.md ascend-llm-qwen/readme.md ---- ascend-llm/readme.md 2024-09-04 19:21:03.080078900 +0800 -+++ ascend-llm-qwen/readme.md 1970-01-01 08:00:00.000000000 +0800 -@@ -1,135 +0,0 @@ --# ascend-llm -- --## 简介 -- --本项目基于昇腾310芯片部署大语言模型,目前已经成功运行meta-llama/Llama-2-7b-hf和TinyLlama/TinyLlama-1.1B-Chat-v1.0。 -- --本实践项目由南京大学计算机科学与技术系杜骋同学主导,由朱光辉老师进行指导,由昇腾CANN生态使能团队提供技术支持,并在昇腾开发者大会2024进行了展示。 -- --## 效果预览 -- --![](./assets/webui.png) -- -- --## 关键技术 --- 静态图方案 -- -- 在Transformer模型中,基于模型的自回归推理特性,业界普遍采用kvcache缓存的方式增加模型的推理性能。kvcache会缓存上一次推理得到的kv矩阵用于本次推理,大大减少了推理计算量。 -- -- 由于缓存的kv矩阵要和当前输入字符计算出的kv矩阵进行拼接,因此每次推理完整的kv矩阵长度一直在增加,致使模型shape不固定,会走动态推理流程,存在大量算子编译时间,推理性能大大下降。 -- -- 本方案基于原先动态图方案,将kv矩阵固定到一个最大长度,结合attention_mask屏蔽输入序列部分位置的特性实现了静态图的方案。在kvcache达到上限时通过KV缓存驱逐([StreamingLLM](https://arxiv.org/abs/2309.17453)和[Heavy-Hitter Oracle](https://arxiv.org/abs/2306.14048))让模型可以反复推理。 -- --- 量化方案 -- -- 大模型权重过大,在端侧设备由于内存限制通常难以运行,因此通常将大模型权重从fp16量化到int8甚至int4降低内存消耗. -- -- 本项目采用平滑激活([SmoothQuant](https://arxiv.org/abs/2211.10438)),动态混合精度分解(类似[LLM.int8](https://arxiv.org/abs/2208.07339)),静态混合精度分解量化方案,通过对权重和激活值均采用int8量化,显著节省了内存并提升了推理速度。 -- -- --## 运行方式 -- --### 环境准备 -- --1. 昇腾软硬件解决方案(驱动+固件+CANN) -- -- 前往[昇腾社区](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha002/cannquickstart/quickstart/instg_000021.html),按照说明下载安装。 -- 或者下载[香橙派0318镜像](https://www.hiascend.com/forum/thread-0231149828762292018-1-1.html),烧录到sd卡,启动环境,参考[香橙派AIpro快速上手指南](https://www.hiascend.com/forum/thread-0260140249549075069-1-1.html)。 --2. 第三方依赖 -- -- 模型导出和推理相关文件夹下requirements.txt,使用pip 进行安装。 -- -- ```shell -- pip install -r requirements.txt -- ``` -- --本项目测试环境:香橙派AI pro,CANN 7.0/7.2,python 3.9。 -- --### 算子适配 -- -- - protoc安装 -- -- 根据昇腾文档选择合适的protoc版本,protoc版本和CANN版本强相关。CANN7.0/7.2使用的protoc 1.13.0 -- -- ``` -- # 安装protoc==1.13.0, 找一空闲目录下载 -- wget https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/wanzutao/tiny-llama/protobuf-all-3.13.0.tar.gz --no-check-certificate -- tar -zxvf protobuf-all-3.13.0.tar.gz -- cd protobuf-3.13.0 -- apt-get update -- apt-get install autoconf automake libtool -- ./autogen.sh -- ./configure -- make -j4 -- make install -- sudo ldconfig -- protoc --version # 查看版本号 -- ``` -- -- - 算子编译部署 -- ``` -- # 将./custom_op/matmul_integer_plugin.cc 拷贝到指定路径 -- cd tiny_llama -- export ASCEND_PATH=/usr/local/Ascend/ascend-toolkit/latest -- cp custom_op/matmul_integer_plugin.cc $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx/framework/onnx_plugin/ -- cd $ASCEND_PATH/tools/msopgen/template/custom_operator_sample/DSL/Onnx -- ``` -- 打开build.sh,找到下面四个环境变量,解开注释并修改如下: -- ``` -- export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include -- export TOOLCHAIN_DIR=/usr -- export AICPU_KERNEL_TARGET=cust_aicpu_kernels -- export AICPU_SOC_VERSION=Ascend310B4 -- ``` -- - 编译运行 -- ``` -- ./build.sh -- cd build_out/ -- ./custom_opp_ubuntu_aarch64.run -- # 生成文件到customize到默认目录 $ASCEND_PATH/opp/vendors/,删除冗余文件 -- cd $ASCEND_PATH/opp/vendors/customize -- rm -rf op_impl/ op_proto/ -- ``` -- --### 模型量化与导出 -- --导出的模型可以从[阿里云盘](https://www.alipan.com/s/ro1NDLjFxtf)中下载 -- --1. 
导出onnx:将transformer库中的modeling_llama替换为export_llama文件下的[modeling_llama](./export_llama/modeling_llama_4.35.py)。通过一下命令将模型导出为onnx(相对路径均为相对export_llama.py文件) -- ```bash -- python export_llama.py \ -- --model \ -- --output \ -- --act-path -- --quant -- ``` -- 模型量化具体见[readme](./export_llama/readme.md)。对于TinyLlama-1.1B建议采用per-token的absmax量化(即w8x8.py)或者平滑激活(即smooth.py);对于Llama-2-7b-hf,建议采用静态混合精度分解(即sd.py)或者平滑激活+静态混合精度分解(即smsd.py)。已经测试的方案为TinyLlama-1.1B per-token的absmax量化,Llama-2-7b-hf 静态混合精度分解。 --3. ATC模型转换 -- ``` bash -- atc --framework=5 --model="xxx.onnx" --output="xxx" --input_format=ND --input_shape="input_ids:batch,seq_len;attention_mask:batch,seq_len+kv_len;position_ids:batch,seq_len;past_key_values:n_layer,2,batch,n_head,kv_len,head_dim" --log=debug --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype -- ``` -- 上述的n_layer, n_head, head_dim变量由模型决定。对于Llama-2-7b,n_layer=32, n_head=32, head_dim=128;对于TinyLlama-1.1B,n_layer=22, n_head=4, head_dim=64 -- -- 对于batch, seq_len, kv_len, 请根据需要填入,建议设置batch=1, seq_len=1, kv_len=1024。如对于TinyLlama-1.1B -- -- ```bash -- atc --framework=5 --model="./tiny-llama.onnx" --output="tiny-llama" --input_format=ND --input_shape="input_ids:1,1;attention_mask:1,1025;position_ids:1,1;past_key_values:22,2,1,4,1024,64" --log=debug --soc_version=Ascend310B1 --precision_mode=must_keep_origin_dtype -- ``` -- -- 对于Llama-2-7b,ATC转换占用内存较大,建议采用其他设备转换,如采用香橙派进行模型转换可以`export MAX_COMPILE_CORE_NUMBER=1`和`export TE_PARALLEL_COMPILER=1`,并开swap分区(推理时请关闭swap,会影响性能)。 -- --### 模型推理运行 -- --目前提供两种运行模式: --1. cli模式:在终端运行,每一次输入一行,一次性返回所有的推理结果。 --2. web模式:前端代码在[github](https://github.com/yinghuo302/ascend-llm-web)或者[gitee](https://gitee.com/yinghuo302/ascend-llm-web),打包出dist文件夹,放在inference文件夹下即可。 -- --```bash --cd inference --python main.py \ -- --model \ -- --hf-dir \ # 需要tokenizer和模型配置文件,权重不需要 -- --engine \ -- --sampling --sampling_value <> --temperature <> \ #采样相关配置 -- --cli # 添加--cli表示在终端运行 --``` ++ max_cache_size:int=1024 # kvcache的最大长度 + head_num:int=4 + num_kv_group:int = 8 # for GQA + head_dim:int=64 \ No newline at end of file -- Gitee From b58a70355c96d992f59e714d20504f3a21e0f2e7 Mon Sep 17 00:00:00 2001 From: linsicong Date: Thu, 5 Sep 2024 07:24:46 +0000 Subject: [PATCH 6/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch. Signed-off-by: linsicong --- ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch index 4481af301c..84e6d03e70 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/diff_model.patch @@ -1,5 +1,5 @@ --- modeling_qwen2.py 2024-09-04 22:30:47.490111800 +0800 -+++ modeling_qwen2_export.py 2024-09-04 22:49:20.540908500 +0800 ++++ modeling_qwen2_export.py 2024-09-04 22:49:20.540908500 +0800 @@ -162,6 +162,10 @@ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) -- Gitee From 86f9e1c1bdc6a12fd756a0e9fa7e0ccbacc0494b Mon Sep 17 00:00:00 2001 From: linsicong Date: Thu, 5 Sep 2024 07:25:07 +0000 Subject: [PATCH 7/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt. 
Signed-off-by: linsicong --- ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt index 62b2560a86..16613f2119 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/requirements.txt @@ -2,4 +2,4 @@ torch==2.1.0 torch_npu onnx transformers==4.38.2 -lm-eval==0.4.2 # for eval \ No newline at end of file +lm-eval==0.4.2 # for eval -- Gitee From 7697f50c9b1e1e7ccc5829ab223bbb2daa3b03de Mon Sep 17 00:00:00 2001 From: linsicong Date: Thu, 5 Sep 2024 07:25:33 +0000 Subject: [PATCH 8/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py. Signed-off-by: linsicong --- ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py index f1068d88f6..72fef61f64 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py @@ -11,7 +11,7 @@ def main(cli: bool, engine: LlamaInterface, dataset): elif dataset == 'GSM8K': engine.test_gsm8k() else: - print("dataset is not support!") + print("dataset is not support! ") if __name__ == '__main__': parser = argparse.ArgumentParser() -- Gitee From 8e2315de00a583b5eb53223caa5f2b002e43666a Mon Sep 17 00:00:00 2001 From: linsicong Date: Fri, 6 Sep 2024 01:55:10 +0000 Subject: [PATCH 9/9] update ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py. Signed-off-by: linsicong --- .../built-in/nlp/Qwen_for_Pytorch/test.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py index 72fef61f64..b285cecce8 100644 --- a/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py +++ b/ACL_PyTorch/built-in/nlp/Qwen_for_Pytorch/test.py @@ -1,3 +1,32 @@ +# BSD 3-Clause License + +# Copyright (c) 2017, +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse from config import InferenceConfig from inference import LlamaInterface -- Gitee
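For reference, a minimal NumPy sketch of the head-plus-recent eviction strategy that the StreamLLM cache in the bundled diff.patch implements: the first `head_len` "attention sink" positions are kept, the oldest entries after them are dropped, and the remaining recent entries are shifted left so the cache never exceeds its fixed capacity. The function name `evict_streamllm`, the standalone signature and the toy shapes are illustrative assumptions; the cache layout `[n_layer, 2, batch, n_head, max_size, head_dim]` follows the huggingface-tensor format noted in kvcache.py.

```python
import numpy as np

def evict_streamllm(kv_cache: np.ndarray, kv_size: int, head_len: int, space_need: int) -> int:
    """Drop the oldest `space_need` entries after the first `head_len`
    "attention sink" positions, shifting the more recent entries left
    in place, and return the new number of valid cache entries.

    Assumes the huggingface-tensor layout
    [n_layer, 2, batch, n_head, max_size, head_dim] used in kvcache.py.
    """
    kv_cache[:, :, :, :, head_len:kv_size - space_need] = \
        kv_cache[:, :, :, :, head_len + space_need:kv_size]
    return kv_size - space_need

# Toy usage: 2 layers, batch 1, 4 heads, capacity 16, head_dim 8.
cache = np.random.randn(2, 2, 1, 4, 16, 8).astype(np.float16)
new_size = evict_streamllm(cache, kv_size=16, head_len=4, space_need=6)
print(new_size)  # 10: the 4 sink positions plus the 6 most recent entries survive
```

Because `max_size` is fixed, this in-place shift is what lets the exported model keep a static KV shape (and thus a static graph) while still handling conversations longer than the cache.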
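The CLI in inference/main.py exposes `--sampling` (greedy, top_k or top_p), `--sampling_value` and `--temperature`. A minimal sketch of how a top-k sampler typically consumes those values follows, assuming `--sampling_value` is read as k for top_k; the helper name `sample_top_k` and the exact normalisation are illustrative, not code taken from inference.py.

```python
import numpy as np

def sample_top_k(logits: np.ndarray, k: int = 10, temperature: float = 0.7) -> int:
    """Sample the next token id from the k most likely candidates.

    `logits` is a 1-D array over the vocabulary; temperature rescales the
    distribution before the softmax. This matches the meaning, not
    necessarily the exact implementation, of --sampling top_k together
    with --sampling_value and --temperature.
    """
    scaled = logits.astype(np.float64) / max(temperature, 1e-5)
    top_idx = np.argpartition(-scaled, k - 1)[:k]       # indices of the k largest logits
    top = scaled[top_idx] - scaled[top_idx].max()       # stabilise exp()
    probs = np.exp(top)
    probs /= probs.sum()
    return int(np.random.choice(top_idx, p=probs))

# Toy usage over an 8-token vocabulary.
next_id = sample_top_k(np.array([0.1, 2.0, -1.0, 0.5, 3.0, 0.0, 1.5, -0.5]), k=3)
```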