diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..247ba8e387c66c7ad1ca94c1e1760886fbaf3801
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md
@@ -0,0 +1,57 @@
+# ChatGLM3-6B-32K
+
+## Description
+
+ChatGLM3-6B-32K builds on ChatGLM3-6B and further strengthens long-text understanding, handling contexts of up to 32K tokens. Specifically, the positional encoding was updated and more targeted long-text training methods were designed, using a 32K context length during the training phase. In practice, if your context length is mostly within 8K, we recommend ChatGLM3-6B; if you need to handle context lengths beyond 8K, we recommend ChatGLM3-6B-32K.
+
+## Setup
+
+### Install
+
+To run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+pip3 install transformers==4.37.1
+```
+
+### Download
+
+Pretrained model: 
+
+```bash
+mkdir -p /data/chatglm/
+mv chatglm3-6b-32k /data/chatglm/  # unpack the downloaded .zip/.tar archive first, then move the extracted folder here
+```
+
+## Run model
+
+```bash
+python3 offline_inference.py --model /data/chatglm/chatglm3-6b-32k --trust-remote-code --temperature 0.0 --max-tokens 256
+```
+
+## Use the server
+
+### Start the server
+
+```bash
+python3 -m vllm.entrypoints.openai.api_server --model /data/chatglm/chatglm3-6b-32k --gpu-memory-utilization 0.9 --max-num-batched-tokens 8193 \
+        --max-num-seqs 32 --disable-log-requests --host 127.0.0.1 --port 12345 --trust-remote-code
+```
+
+### Test using the OpenAI interface
+
+```bash
+python3 server_inference.py --host 127.0.0.1 --port 12345 --model_path /data/chatglm/chatglm3-6b-32k
+```
+
+## Results
+
+| Model           | Precision | Tokens | QPS (tokens/s) |
+| --------------- | --------- | ------ | -------------- |
+| ChatGLM3-6B-32K | FP16      | 745    | 110.85         |
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc731079f72988cd20c5a68b3ccb4e192769c8fb
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
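+
+# Offline batched inference demo for ChatGLM3-6B-32K on vLLM.
+# Example invocation (taken from the README; adjust the model path to your setup):
+#   python3 offline_inference.py --model /data/chatglm/chatglm3-6b-32k \
+#       --trust-remote-code --temperature 0.0 --max-tokens 256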
+
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import dataclasses
+import inspect
+import logging
+import time
+
+import torch
+from utils import load_chat_template, sampling_add_cli_args
+from vllm import LLM, EngineArgs, SamplingParams
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--chat_template", type=str, default=None)
+    parser.add_argument(
+        "--remove_chat_template",
+        default=False,
+        action="store_true",
+        help="pass this if you are not using a chat model",
+    )
+    parser = EngineArgs.add_cli_args(parser)
+    parser = sampling_add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+    sampling_args = [
+        param.name
+        for param in list(
+            inspect.signature(SamplingParams.__init__).parameters.values()
+        )[1:]
+    ]
+    engine_params = {attr: getattr(args, attr) for attr in engine_args}
+    sampling_params = {
+        attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr)
+    }
+
+    model_name = args.model.strip()
+    model_name = model_name if args.model[-1] != "/" else model_name[:-1]
+    model_name = model_name.rsplit("/")[-1]
+
+    # Sample prompts.
+    prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(**sampling_params)
+
+    # Create an LLM.
+    llm = LLM(**engine_params)
+
+    # process chat template
+    if args.remove_chat_template:
+        if "chat" in model_name.lower():
+            logging.warning(
+                f"The model name parsed from the model path is {model_name}, which looks like a chat model, "
+                "so the input prompt normally needs extra chat-template processing. "
+                "If the results look wrong, make sure you really intend to pass --remove_chat_template on the CLI."
+            )
+        prompts_new = prompts
+    else:
+        # Build the chat-model prompt
+        logging.warning(
+            "If you are using a non-chat model, please pass --remove_chat_template on the CLI."
+        )
+        logging.warning(
+            "For now, the OpenAI chat interface (v1/chat/completions) needs a chat template to process string prompts for good results. "
+            "Otherwise you have to fall back to the default chat template, which may lead to bad answers. However, building the chat input is complex "
+            "for some models and the processing rules cannot always be written as a Jinja file. Fortunately, the v1/completions interface accepts List[int] "
+            "prompts. This means you can process the prompt first, then send the List[int] to v1/completions and use it as if it were v1/chat/completions "
+            "when calling the OpenAI API."
+        )
+        tokenizer = llm.get_tokenizer()
+        prompts_new = []
+        for prompt in prompts:
+            input_idx = (
+                tokenizer.build_chat_input(prompt)["input_ids"][0].cpu().tolist()
+            )
+            prompts_new.append(input_idx)
+
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
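+    # The first generate() call below is an untimed warm-up pass; the timed run
+    # between the perf_counter() readings is what the reported token/QPS numbers
+    # are based on.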
+    outputs = (
+        llm.generate(prompts_new, sampling_params, use_tqdm=False)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(
+            sampling_params=sampling_params,
+            prompt_token_ids=prompts_new,
+            use_tqdm=False,
+        )
+    )
+    torch.cuda.synchronize()
+
+    start_time = time.perf_counter()
+    outputs = (
+        llm.generate(prompts_new, sampling_params)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+    )
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    duration_time = end_time - start_time
+
+    num_tokens = 0
+    # Print the outputs.
+    for i, output in enumerate(outputs):
+        prompt = prompts[i]  # show the original prompt; the prompt actually used is in "output.prompt"
+        generated_text = output.outputs[0].text
+
+        num_tokens += len(output.outputs[0].token_ids)
+        print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
+    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
\ No newline at end of file
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6dcf8b88dd25d95c972cee251291c9e515fd9b8
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import time
+
+from openai import OpenAI
+from transformers import AutoTokenizer
+
+
+def send_request(
+    api_url: str,
+    prompt: str,
+    output_len: int,
+    stream: bool,
+) -> None:
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key="EMPTY",
+        base_url=api_url,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    completion = client.completions.create(
+        model=model,
+        # messages=[{"role": "user", "content": prompt},],
+        prompt=prompt,
+        n=1,
+        stream=stream,
+        max_tokens=output_len,
+        temperature=0.0,
+    )
+
+    if stream:
+        for each_com in completion:
+            print(each_com)
+    else:
+        print("++++++++++++++++++")
+        print(completion)
+
+
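+# Note: the prompts below are pre-tokenized with the ChatGLM3 tokenizer's
+# build_chat_input(), so chat formatting happens on the client side and the
+# request carries token IDs rather than raw strings (see the comment in
+# offline_inference.py about the v1/completions interface).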
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark the online serving throughput."
+    )
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--stream", action="store_true")
+    parser.add_argument("--output_token", type=int, default=1024)
+    parser.add_argument("--model_path", type=str)
+
+    args = parser.parse_args()
+    api_url = f"http://{args.host}:{args.port}/v1"
+
+    prompts = [
+        "你好",
+        "Which city is the capital of China?",
+        "1 + 1 = ?",
+        "中国的首都是哪里",
+        "请将以下内容翻译为英文:\n你好,我来自中国。",
+    ]
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    prompts = [tokenizer.build_chat_input(i).input_ids.tolist() for i in prompts]
+
+    for prompt in prompts:
+        send_request(api_url, prompt, args.output_token, args.stream)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fc072d8d51e35109a97c17b5476e7bf3aa1448b
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py
@@ -0,0 +1,371 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from copy import deepcopy
+from typing import Tuple, List, Union
+
+import codecs
+import logging
+import argparse
+
+# For chat models, or models that require specific input formats, the prompt needs
+# additional processing. If you have extra prompt-processing requirements or error
+# feedback while using the modelzoo, please contact the maintainers and we will
+# update and adapt it.
+
+def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    args.add_argument(
+        '--n',
+        type=int,
+        default=1,
+        help="Number of output sequences to return for the given prompt.")
+    args.add_argument(
+        '--best-of',
+        type=int,
+        default=None,
+        help="Number of output sequences that are generated from the prompt. "
+        "From these `best_of` sequences, the top `n` sequences are returned. "
+        "`best_of` must be greater than or equal to `n`. This is treated as "
+        "the beam width when `use_beam_search` is True. By default, `best_of` "
+        "is set to `n`.")
+    args.add_argument(
+        '--presence-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on whether they "
+        "appear in the generated text so far. Values > 0 encourage the model "
+        "to use new tokens, while values < 0 encourage the model to repeat "
+        "tokens.")
+    args.add_argument(
+        '--frequency-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on their "
+        "frequency in the generated text so far. Values > 0 encourage the "
+        "model to use new tokens, while values < 0 encourage the model to "
+        "repeat tokens.")
+    args.add_argument(
+        '--repetition-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes new tokens based on whether "
+        "they appear in the prompt and the generated text so far. Values > 1 "
+        "encourage the model to use new tokens, while values < 1 encourage "
+        "the model to repeat tokens.")
+    args.add_argument(
+        '--temperature',
+        type=float,
+        default=1.0,
+        help="Float that controls the randomness of the sampling. Lower "
+        "values make the model more deterministic, while higher values make "
+        "the model more random. Zero means greedy sampling.")
+    args.add_argument(
+        '--top-p',
+        type=float,
+        default=1.0,
+        help="Float that controls the cumulative probability of the top tokens "
+        "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.")
+    args.add_argument(
+        '--top-k',
+        type=int,
+        default=-1,
+        help="Integer that controls the number of top tokens to consider. Set "
+        "to -1 to consider all tokens.")
+    args.add_argument(
+        '--min-p',
+        type=float,
+        default=0.0,
+        help="Float that represents the minimum probability for a token to be "
+        "considered, relative to the probability of the most likely token. "
+        "Must be in [0, 1]. Set to 0 to disable this.")
+    args.add_argument(
+        '--use-beam-search',
+        default=False,
+        action="store_true",
+        help="Whether to use beam search instead of sampling.")
+    args.add_argument(
+        '--length-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes sequences based on their length. Used in beam search.")
+    args.add_argument(
+        '--stop',
+        type=str,
+        default=None,
+        help="List of strings that stop the generation when they are generated. "
+        "The returned output will not contain the stop strings.")
+    args.add_argument(
+        '--stop-token-ids',
+        type=int,
+        default=None,
+        help="List of tokens that stop the generation when they are "
+        "generated. The returned output will contain the stop tokens unless "
+        "the stop tokens are special tokens.")
+    args.add_argument(
+        '--include-stop-str-in-output',
+        default=False,
+        action="store_true",
+        help="Whether to include the stop strings in output text. Defaults to False.")
+    args.add_argument(
+        '--ignore-eos',
+        default=False,
+        action="store_true",
+        help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.")
+    args.add_argument(
+        '--max-tokens',
+        type=int,
+        default=16,
+        help="Maximum number of tokens to generate per output sequence.")
+    args.add_argument(
+        '--logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per output token. "
+        "Note that the implementation follows the OpenAI API: The return "
+        "result includes the log probabilities on the `logprobs` most likely "
+        "tokens, as well as the chosen tokens. The API will always return the "
+        "log probability of the sampled token, so there may be up to "
+        "`logprobs+1` elements in the response.")
+    args.add_argument(
+        '--prompt-logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per prompt token.")
+    args.add_argument(
+        '--skip-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to skip special tokens in the output.")
+    args.add_argument(
+        '--spaces-between-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to add spaces between special tokens in the output. Defaults to True.")
+    # early_stopping logits_processors seed
+    return args
+
+
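+# Minimal usage sketch (mirroring offline_inference.py in this directory): the
+# sampling flags above are meant to be added to the same parser as vLLM's
+# engine flags, e.g.
+#
+#     parser = argparse.ArgumentParser()
+#     parser = EngineArgs.add_cli_args(parser)
+#     parser = sampling_add_cli_args(parser)
+#     args = parser.parse_args()
+#
+# after which EngineArgs/SamplingParams can be built from the parsed namespace.
+
+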
+def load_chat_template(tokenizer, chat_template):
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # If opening the file fails, treat chat_template as the template string
+            # itself and decode it so escape sequences are interpreted correctly
+            tokenizer.chat_template = codecs.decode(
+                chat_template, "unicode_escape")
+
+        logging.info(
+            f"Using supplied chat template:\n{tokenizer.chat_template}"
+        )
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}"
+        )
+    else:
+        logging.warning(
+            "No chat template provided. Chat API will not work.")
+
+def default_build_chat(tokenizer, prompt):
+    return prompt
+
+def chatglm2_build_chat(tokenizer, prompt):
+    return tokenizer.build_prompt(prompt)
+
+def chatglm3_build_chat(tokenizer, prompt):
+    return tokenizer.build_chat_input(prompt).input_ids[0].tolist()
+
+def llama2_build_chat(tokenizer, prompt):
+    return f"[INST]{prompt}[/INST]"
+
+# adapted from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py
+def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512):
+    def _parse_messages(messages, split_role="user"):
+        system, rounds = "", []
+        round = []
+        for i, message in enumerate(messages):
+            if message["role"] == "system":
+                assert i == 0
+                system = message["content"]
+                continue
+            if message["role"] == split_role and round:
+                rounds.append(round)
+                round = []
+            round.append(message)
+        if round:
+            rounds.append(round)
+        return system, rounds
+
+    messages = [{"role": "user", "content": f"{prompt}"}]
+    max_new_tokens = max_new_tokens
+    max_input_tokens = 4096 - max_new_tokens
+    system, rounds = _parse_messages(messages, split_role="user")
+    system_tokens = tokenizer.encode(system)
+    max_history_tokens = max_input_tokens - len(system_tokens)
+
+    history_tokens = []
+    for round in rounds[::-1]:
+        round_tokens = []
+        for message in round:
+            if message["role"] == "user":
+                round_tokens.append(195)
+            else:
+                round_tokens.append(196)
+            round_tokens.extend(tokenizer.encode(message["content"]))
+        if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens:
+            history_tokens = round_tokens + history_tokens  # concat left
+            if len(history_tokens) < max_history_tokens:
+                continue
+        break
+
+    input_tokens = system_tokens + history_tokens
+    if messages[-1]["role"] != "assistant":
+        input_tokens.append(196)
+    input_tokens = input_tokens[-max_input_tokens:]  # truncate left
+    return input_tokens

+def qwen_build_chat(
+    tokenizer,
+    query: str,
+    history: List[Tuple[str, str]] = None,
+    system: str = "",
+    max_window_size: int = 6144,
+    chat_format: str = "chatml",
+):
+    if history is None:
+        history = []
+
+    if chat_format == "chatml":
+        im_start, im_end = "<|im_start|>", "<|im_end|>"
+        im_start_tokens = [tokenizer.im_start_id]
+        im_end_tokens = [tokenizer.im_end_id]
+        nl_tokens = tokenizer.encode("\n")
+
+        def _tokenize_str(role, content):
+            return f"{role}\n{content}", tokenizer.encode(
+                role, allowed_special=set()
+            ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
+
+        system_text, system_tokens_part = _tokenize_str("system", system)
+        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
+
+        raw_text = ""
+        context_tokens = []
+
+        for turn_query, turn_response in reversed(history):
+            query_text, query_tokens_part = _tokenize_str("user", turn_query)
+            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
+            response_text, response_tokens_part = _tokenize_str(
+                "assistant", turn_response
+            )
+            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
+
+            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
+            prev_chat = (
+                f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
+            )
+
+            current_context_size = (
+                len(system_tokens) + len(next_context_tokens) + len(context_tokens)
+            )
+            if current_context_size < max_window_size:
+                context_tokens = next_context_tokens + context_tokens
+                raw_text = prev_chat + raw_text
+            else:
+                break
+
+        context_tokens = system_tokens + context_tokens
+        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
+        context_tokens += (
+            nl_tokens
+            + im_start_tokens
+            + _tokenize_str("user", query)[1]
+            + im_end_tokens
+            + nl_tokens
+            + im_start_tokens
+            + tokenizer.encode("assistant")
+            + nl_tokens
+        )
+        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
+
+    elif chat_format == "raw":
+        raw_text = query
+        context_tokens = tokenizer.encode(raw_text)
+    else:
+        raise NotImplementedError(f"Unknown chat format {chat_format!r}")
+
+    return raw_text, context_tokens
+
+def codellama_build_chat(tokenizer, prompt):
+    return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:{}[/INST]".format(prompt)
+
+def build_chat(tokenizer, prompt, model_name, **kwargs):
+    model_name = model_name.lower()
+    # return str or list[int]
+    if "chatglm2" in model_name:
+        prompt = chatglm2_build_chat(tokenizer, prompt)
+    elif "chatglm3" in model_name:
+        prompt = chatglm3_build_chat(tokenizer, prompt)
+    elif "llama2" in model_name and 'chat' in model_name:
+        prompt = llama2_build_chat(tokenizer, prompt)
+    elif "baichuan2" in model_name and 'chat' in model_name:
+        prompt = baichuan2_build_chat(tokenizer, prompt, kwargs['max_length'])
+    elif "qwen" in model_name and 'chat' in model_name:
+        prompt = qwen_build_chat(tokenizer, prompt)
+    elif "code" in model_name and 'llama' in model_name:
+        prompt = codellama_build_chat(tokenizer, prompt)
+    else:
+        prompt = default_build_chat(tokenizer, prompt)
+    return prompt
+
+
+# for output post-processing
+def default_post_process(output):
+    return output
+
+def glm2_post_process(output):
+    output = output.strip()
+    output = output.replace("[[训练时间]]", "2023年")
+    return output
+
+def glm3_post_process(output, history=[]):
+    content = ""
+    history = deepcopy(history)
+    for response in output.split("<|assistant|>"):
+        metadata, content = response.split("\n", maxsplit=1)
+        if not metadata.strip():
+            content = content.strip()
+            history.append({"role": "assistant", "metadata": metadata, "content": content})
+            content = content.replace("[[训练时间]]", "2023年")
+        else:
+            history.append({"role": "assistant", "metadata": metadata, "content": content})
+            if history[0]["role"] == "system" and "tools" in history[0]:
+                content = "\n".join(content.split("\n")[1:-1])
+                def tool_call(**kwargs):
+                    return kwargs
+                parameters = eval(content)
+                content = {"name": metadata.strip(), "parameters": parameters}
+            else:
+                content = {"name": metadata.strip(), "content": content}
+    return content
+
+def post_process(response, model_name, **kwargs):
+    model_name = model_name.lower()
+    if "chatglm2" in model_name:
+        response = glm2_post_process(response)
+    elif "chatglm3" in model_name:
+        response = glm3_post_process(response)
+    else:
+        response = default_post_process(response)
+    return response
\ No newline at end of file