diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/README.md b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..35734aae9e3be8a0f0ccd505689775dbc14dc524
--- /dev/null
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md
@@ -0,0 +1,52 @@
+# Baichuan-2-7B
+
+## Description
+Baichuan 2 is a new-generation open-source large language model launched by Baichuan Intelligence. It is trained on a high-quality corpus of 2.6 trillion tokens and achieves state-of-the-art performance among models of similar size on authoritative Chinese, multilingual, and domain-specific benchmarks, demonstrating strong language understanding and generation capabilities. This release includes the Base and Chat versions of the 7B model.
+
+## Setup
+
+### Install
+In order to run the model smoothly, we need the following dependency files:
+1. ixrt-xxx.whl
+2. ixformer-xxx.whl
+3. vllm-xxx.whl
+Please contact the staff to obtain the relevant installation packages.
+
+```bash
+yum install mesa-libGL
+pip3 install transformers==4.33.2
+pip3 install Path/To/ixrt-xxx.whl
+pip3 install Path/To/vllm-xxx.whl
+pip3 install Path/To/ixformer-xxx.whl
+```
+
+### Download
+Pretrained model: download the Baichuan2-7B-Base weights (e.g. from https://huggingface.co/baichuan-inc/Baichuan2-7B-Base), move the archive (tar or zip) into /data/baichuan/, and extract it there.
+
+```bash
+mkdir /data/baichuan/
+mv Baichuan2-7B-Base.tar/zip /data/baichuan/
+```
+
+
+## Run model
+
+```bash
+python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/ --chat_template template_baichuan.jinja --trust-remote-code
+```
+
+## Run Baichuan2-7B with w8a16 quantization
+
+### Generate int8 weights
+
+The int8 weights will be saved to /data/baichuan/Baichuan2-7B-Base/int8:
+```bash
+python3 convert2int8.py --model-path /data/baichuan/Baichuan2-7B-Base/
+```
+
+### Run
+
+```bash
+python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --trust-remote-code --max-num-seqs 1 --max-model-len 256 \
+    --tensor-parallel-size 2 --temperature 0.0
+```
\ No newline at end of file
diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py b/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py
new file mode 100644
index 0000000000000000000000000000000000000000..a244476159c5762f1b0fb5cceea86c50bd7c9066
--- /dev/null
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
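+
+# convert2int8.py produces the w8a16 checkpoint consumed by `--quantization w8a16`
+# (int8 weights, fp16 activations). For every *.bin shard it loads the tensors on
+# CPU and quantizes each weight whose name contains "proj" or "pack" (Baichuan's
+# fused W_pack and the projection layers) per output channel:
+#     scale  = max(|w|, dim=-1) / 127
+#     w_int8 = clamp(w / scale, -127, 127), stored as int8
+# The scales are stored under the matching "*.scales" key, all other tensors and
+# the json/py/model side files are copied unchanged, the shards are renamed to
+# *_int8.bin, and an empty w8a16_config.json marker is written to the output dir.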
+
+import torch
+import os
+import sys
+from collections import OrderedDict
+import argparse
+import glob
+import shutil
+import json
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model-path",type=str,default=None)
+args = parser.parse_args()
+
+
+def float2int8(load_path, save_path):
+    all_files = glob.glob(os.path.join(load_path,'*'))
+    os.makedirs(save_path)
+    print(f"save int8 weight to: {save_path}")
+    for raw_file in all_files:
+        ext_name = os.path.splitext(raw_file)[-1]
+        if ext_name in ['.json', '.py', '.model']:
+            dst_file = os.path.split(raw_file)[-1]
+            dst_file = os.path.join(save_path, dst_file)
+            shutil.copy(raw_file, dst_file)
+            print(f"copy file `{raw_file}` to `{dst_file}`")
+        elif ext_name == ".bin":
+            print(f"quantize `{raw_file}`")
+            params = torch.load(raw_file,map_location="cpu")
+            new_params = OrderedDict()
+            keys = ['proj','pack']
+            for k,v in params.items():
+                find_key = False
+                for key in keys:
+                    if key in k:
+                        scale = torch.abs(v).max(dim=-1)[0] / 127.0
+                        int8_v = torch.clamp(v / scale.view(-1,1),min=-127,max=127).to(torch.int8).contiguous()
+                        scale = scale.view(1,-1).contiguous()
+                        new_params[k] = int8_v
+                        new_params[k.replace("weight","scales")] = scale
+                        find_key = True
+                        break
+                if find_key:
+                    continue
+                # save the other param
+                new_params[k] = v
+            file_name = os.path.basename(raw_file)
+            file_name_no_suffix = file_name.rsplit('.',1)[0]
+            new_file_name = file_name_no_suffix+"_int8.bin"
+            torch.save(new_params,os.path.join(save_path,new_file_name))
+
+    config_file = os.path.join(save_path, "w8a16_config.json")
+    with open(config_file, 'w') as f:
+        f.write(json.dumps({}))
+
+if __name__ == "__main__":
+    model_path = args.model_path
+    save_path = os.path.join(model_path, "int8")
+    if os.path.isdir(save_path):
+        shutil.rmtree(save_path)
+    float2int8(model_path, save_path)
diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d01a7916f3e6e7f66e3dda6c963679e82b96085
--- /dev/null
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
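+
+# Offline inference example for Baichuan2-7B on vLLM.
+# Flow: build one CLI from vLLM's EngineArgs plus the sampling flags defined in
+# utils.sampling_add_cli_args, split the parsed namespace back into engine kwargs
+# (via dataclasses.fields(EngineArgs)) and SamplingParams kwargs (via
+# inspect.signature), optionally apply a chat template to the prompts, then run an
+# untimed warm-up pass followed by a timed generate() that reports token throughput.
+#
+# Example (paths as in the README):
+#   python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/ \
+#       --chat_template template_baichuan.jinja --trust-remote-code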
+
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+from utils import build_chat,post_process,load_chat_template,sampling_add_cli_args
+
+import logging
+import time
+import argparse
+import dataclasses
+import inspect
+
+import torch
+from vllm import LLM, SamplingParams, EngineArgs
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--chat_template",type=str,default=None)
+parser.add_argument("--remove_chat_template",default=True,action="store_false",help="pass this if you are not using a chat model")
+parser = EngineArgs.add_cli_args(parser)
+parser = sampling_add_cli_args(parser)
+args = parser.parse_args()
+
+engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+sampling_args = [param.name for param in list(inspect.signature(SamplingParams.__init__).parameters.values())[1:]]
+engine_params = {attr:getattr(args, attr) for attr in engine_args}
+sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)}
+
+model_name = args.model.strip()
+model_name = model_name if args.model[-1]!='/' else model_name[:-1]
+model_name = model_name.rsplit('/')[-1]
+
+
+# Sample prompts (in Chinese): signs that someone may be experiencing anxiety;
+# how to make a cheese pizza; a survey article on 5G network R&D.
+prompts = [
+    "哪些迹象可能表明一个人正在经历焦虑?",
+    "描述一下如何制作芝士披萨。",
+    "写一篇有关5G网络研发的综述文章。"
+    ]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(**sampling_params)
+
+# Create an LLM.
+llm = LLM(**engine_params)
+
+# Process the chat template.
+if not args.remove_chat_template:
+    if 'chat' not in model_name.lower():
+        logging.warning(f"We assume that you are using a chat model, so additional processing is required for the input prompt. "
+            f"If the result is not quite correct, please ensure that the model path contains the word 'chat'. "
+            f"For now, the model_name parsed from the model path is {model_name}.")
+    prompts_new = prompts
+else:
+    # Build the chat-model prompt.
+    # Try transformers' apply_chat_template; if chat_template is None, the default template is used.
+    # For some old models the default template may produce bad answers. We do not handle that case here,
+    # because the Transformers team is still improving chat templates. For more information,
+    # please refer to https://huggingface.co/docs/transformers/main/chat_templating
+    try:
+        load_chat_template(llm.get_tokenizer(),args.chat_template)
+        prompts_new = []
+        for prompt in prompts:
+            messages = [
+                {"role": "user", "content": prompt}
+            ]
+            text = llm.get_tokenizer().apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            prompts_new.append(text)
+    except Exception:
+        logging.warning("Calling the tokenizer's apply_chat_template failed, possibly because of an old transformers version (try transformers>=4.37.0).")
+        # Fall back to the simple build_chat helper; this part should be controlled by the model developer, we just provide a simple example.
+        prompts_new = [build_chat(llm.get_tokenizer(),prompt,model_name,max_length=args.max_tokens) for prompt in prompts]
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
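+# Note: the first llm.generate() call below is an untimed warm-up pass; the second,
+# timed call is the one used for the reported numbers. num_tokens counts only the
+# generated tokens (len(output.outputs[0].token_ids) per prompt), so the printed
+# "QPS" is effectively generated tokens per second over the timed run rather than
+# queries per second.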
+
+outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False)
+torch.cuda.synchronize()
+
+start_time = time.perf_counter()
+outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new)
+torch.cuda.synchronize()
+end_time = time.perf_counter()
+duration_time = end_time - start_time
+
+num_tokens = 0
+# Print the outputs.
+for i, output in enumerate(outputs):
+    prompt = prompts[i]  # show the original prompt; the prompt actually sent to the engine is output.prompt
+    generated_text = post_process(output.outputs[0].text,model_name)
+
+    num_tokens += len(output.outputs[0].token_ids)
+    print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
+print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+
+# 0.3.2 tokens: 757, QPS: 97.97229589080902
\ No newline at end of file
diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a1812a6c09ab127ffd7fbe60fb9617de90f292c7
--- /dev/null
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja
@@ -0,0 +1,22 @@
+{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
+
+{% for message in messages %}
+{% if message['role'] == 'user' %}
+<reserved_106>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% elif message['role'] == 'assistant' %}
+<reserved_107>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% endif %}
+{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
+<reserved_107>
+{% endif %}
\ No newline at end of file
diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fc072d8d51e35109a97c17b5476e7bf3aa1448b
--- /dev/null
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py
@@ -0,0 +1,371 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from copy import deepcopy
+from typing import Tuple, List, Union
+
+import codecs
+import logging
+import argparse
+
+# For chat models, or models that require a specific input format, the prompt needs additional processing.
+# If you need extra prompt-processing support or want to report an error, please contact 王坚 or 巩亚飞 and we will update the modelzoo accordingly.
+
+def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    args.add_argument(
+        '--n',
+        type=int,
+        default=1,
+        help="Number of output sequences to return for the given prompt.")
+    args.add_argument(
+        '--best-of',
+        type=int,
+        default=None,
+        help="Number of output sequences that are generated from the prompt. "
" + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. 
" + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}" + ) + else: + logging.warning( + "No chat template provided. Chat API will not work.") + +def default_build_chat(tokenizer,prompt): + return prompt + +def chatglm2_build_chat(tokenizer,prompt): + return tokenizer.build_prompt(prompt) + +def chatglm3_build_chat(tokenizer,prompt): + return tokenizer.build_chat_input(prompt).input_ids[0].tolist() + +def llama2_build_chat(tokenizer,prompt): + return f"[INST]{prompt}[/INST]" + +# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py +def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): + def _parse_messages(messages, split_role="user"): + system, rounds = "", [] + round = [] + for i, message in enumerate(messages): + if message["role"] == "system": + assert i == 0 + system = message["content"] + continue + if message["role"] == split_role and round: + rounds.append(round) + round = [] + round.append(message) + if round: + rounds.append(round) + return system, rounds + + messages = [{"role": "user", "content": f"{prompt}"}] + max_new_tokens = max_new_tokens + max_input_tokens = 4096 - max_new_tokens + system, rounds = _parse_messages(messages, split_role="user") + system_tokens = tokenizer.encode(system) + max_history_tokens = max_input_tokens - len(system_tokens) + + history_tokens = [] + for round in rounds[::-1]: + round_tokens = [] + for message in round: + if message["role"] == "user": + round_tokens.append(195) + else: + round_tokens.append(196) + round_tokens.extend(tokenizer.encode(message["content"])) + if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: + history_tokens = round_tokens + history_tokens # concat left + if len(history_tokens) < max_history_tokens: + continue + break + + input_tokens = system_tokens + history_tokens + if messages[-1]["role"] != "assistant": + input_tokens.append(196) + input_tokens = input_tokens[-max_input_tokens:] # truncate left + return input_tokens + +def qwen_build_chat( + tokenizer, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "", + max_window_size: int = 6144, + 
+    chat_format: str = "chatml",
+):
+    if history is None:
+        history = []
+
+    if chat_format == "chatml":
+        im_start, im_end = "<|im_start|>", "<|im_end|>"
+        im_start_tokens = [tokenizer.im_start_id]
+        im_end_tokens = [tokenizer.im_end_id]
+        nl_tokens = tokenizer.encode("\n")
+
+        def _tokenize_str(role, content):
+            return f"{role}\n{content}", tokenizer.encode(
+                role, allowed_special=set()
+            ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
+
+        system_text, system_tokens_part = _tokenize_str("system", system)
+        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
+
+        raw_text = ""
+        context_tokens = []
+
+        for turn_query, turn_response in reversed(history):
+            query_text, query_tokens_part = _tokenize_str("user", turn_query)
+            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
+            response_text, response_tokens_part = _tokenize_str(
+                "assistant", turn_response
+            )
+            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
+
+            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
+            prev_chat = (
+                f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
+            )
+
+            current_context_size = (
+                len(system_tokens) + len(next_context_tokens) + len(context_tokens)
+            )
+            if current_context_size < max_window_size:
+                context_tokens = next_context_tokens + context_tokens
+                raw_text = prev_chat + raw_text
+            else:
+                break
+
+        context_tokens = system_tokens + context_tokens
+        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
+        context_tokens += (
+            nl_tokens
+            + im_start_tokens
+            + _tokenize_str("user", query)[1]
+            + im_end_tokens
+            + nl_tokens
+            + im_start_tokens
+            + tokenizer.encode("assistant")
+            + nl_tokens
+        )
+        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
+
+    elif chat_format == "raw":
+        raw_text = query
+        context_tokens = tokenizer.encode(raw_text)
+    else:
+        raise NotImplementedError(f"Unknown chat format {chat_format!r}")
+
+    return raw_text, context_tokens
+
+def codellama_build_chat(tokenizer,prompt):
+    return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:{}[/INST]".format(prompt)
+
+def build_chat(tokenizer, prompt, model_name, **kwargs):
+    model_name = model_name.lower()
+    # return str or list[int]
+    if "chatglm2" in model_name:
+        prompt = chatglm2_build_chat(tokenizer,prompt)
+    elif "chatglm3" in model_name:
+        prompt = chatglm3_build_chat(tokenizer,prompt)
+    elif "llama2" in model_name and 'chat' in model_name:
+        prompt = llama2_build_chat(tokenizer,prompt)
+    elif "baichuan2" in model_name and 'chat' in model_name:
+        prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length'])
+    elif "qwen" in model_name and 'chat' in model_name:
+        prompt = qwen_build_chat(tokenizer,prompt)
+    elif "code" in model_name and 'llama' in model_name:
+        prompt = codellama_build_chat(tokenizer,prompt)
+    else:
+        prompt = default_build_chat(tokenizer,prompt)
+    return prompt
+
+
+# for output
+def default_post_process(output):
+    return output
+
+def glm2_post_process(output):
+    output = output.strip()
+    output = output.replace("[[训练时间]]", "2023年")
+    return output
+
+def glm3_post_process(output, history=[]):
+    content = ""
+    history = deepcopy(history)
+    for response in output.split("<|assistant|>"):
+        metadata, content = response.split("\n", maxsplit=1)
+        if not metadata.strip():
+            content = content.strip()
+            history.append({"role": "assistant", "metadata": metadata, "content": content})
+            content = content.replace("[[训练时间]]", "2023年")
+        else:
+            history.append({"role": "assistant", "metadata": metadata, "content": content})
+            if history[0]["role"] == "system" and "tools" in history[0]:
+                content = "\n".join(content.split("\n")[1:-1])
+                def tool_call(**kwargs):
+                    return kwargs
+                parameters = eval(content)
+                content = {"name": metadata.strip(), "parameters": parameters}
+            else:
+                content = {"name": metadata.strip(), "content": content}
+    return content
+
+def post_process(response, model_name,**kwargs):
+    model_name = model_name.lower()
+    if "chatglm2" in model_name:
+        response = glm2_post_process(response)
+    elif "chatglm3" in model_name:
+        response = glm3_post_process(response)
+    else:
+        response = default_post_process(response)
+    return response
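+
+# Typical usage (see offline_inference.py in this directory): when
+# tokenizer.apply_chat_template is unavailable, prompts are built with
+# build_chat(tokenizer, prompt, model_name, max_length=...), and generated text is
+# cleaned up with post_process(text, model_name) before printing.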