diff --git a/.gitignore b/.gitignore index c70c40e0f527c8c20a6bf994bcb8070b95e13e27..2417a7f3477ee3d635fb09975cbe0473f2637031 100644 --- a/.gitignore +++ b/.gitignore @@ -142,4 +142,7 @@ cython_debug/ att_advisor*.html *.xlsx operator_tuning_file*.cfg -.ipynb_checkpoints/ \ No newline at end of file +.ipynb_checkpoints/ + +# pycharm settings +.idea \ No newline at end of file diff --git a/debug/OWNERS b/debug/OWNERS index 09121722c9d7147133c6f111cd10b279979ebdb3..311da9c60cb527eff4feb755c3a012fc042e3afb 100644 --- a/debug/OWNERS +++ b/debug/OWNERS @@ -5,7 +5,11 @@ approvers: - kun_8 - binghamhuang - brightlyking +- litian_drinksnow reviewers: - lv-kaimeng -- litian_drinksnow - binghamhuang +- TAJh +- jiandaobao +- pengxiaopeng1 +- zhengxinqian diff --git a/debug/accuracy_tools/api_accuracy_checker/generate_op_script/op_generator.py b/debug/accuracy_tools/api_accuracy_checker/generate_op_script/op_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..7d3e2b226bde101c5aeba46f5353cbf3c53549d0 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/generate_op_script/op_generator.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +import argparse +import json +import os +import math +import numpy as np +import torch +try: + import torch_npu +except ImportError: + pass + +from api_accuracy_checker.compare.compare_utils import BinaryStandardApi, AbsoluteStandardApi, ULPStandardApi + + +TENSOR_DATA_LIST = ["torch.Tensor"] +TORCH_BOOL_TYPE = ["torch.bool"] +TORCH_INT_TYPE = ["torch.uint8", "torch.int8", "torch.int16", "torch.short", "torch.int32", "torch.int", + "torch.int64", "torch.long"] +TORCH_FLOAT_TYPE = ["torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.float", + "torch.float64", "torch.double"] +TORCH_COMPLEX_TYPE = ["torch.complex32", "torch.chalf", "torch.complex64", "torch.cfloat", "torch.complex128", "torch.cdouble"] + + +def check_json(json_path): + json_file = os.path.realpath(json_path) + with open(json_file) as f: + json_content = json.load(f) + if not isinstance(json_content, dict): + raise ValueError("content of json file is not a dictionary!") + if len(list(json_content.items())) > 1: + raise ValueError("json file has more than one API, only one API is allowed!") + (api_full_name, api_info_dict) = list(json_content.items())[0] + (api_type, api_name, ordinal_number) = api_full_name.split(".", -1) + if api_type not in ("Functional", "Tensor", "Torch"): + raise ValueError("type {0} of API is not supported!".format(api_type)) + return (api_full_name, api_info_dict) + + +def check_user_settings(cmd_args): + iter_t = cmd_args.iter_times + if iter_t <= 0: + raise ValueError("iter_times should be an integer bigger than zero!") + (api_full_name, api_info_dict) = check_json(cmd_args.forward_json_path) + return api_full_name, api_info_dict + + +def get_compare_standard(api_name): + if api_name in BinaryStandardApi: + return "CompareStandard.BINARY_EQUALITY_STANDARD" + if api_name in AbsoluteStandardApi: + return "CompareStandard.ABSOLUTE_THRESHOLD_STANDARD" + if api_name in ULPStandardApi: + return "CompareStandard.ULP_ERROR_STANDARD" + return "CompareStandard.BENCHMARK_STANDARD" + + +def get_settings(cmd_args): + ''' + internal_settings contain all information needed for the operator program. 
+
+
+def check_user_settings(cmd_args):
+    iter_t = cmd_args.iter_times
+    if iter_t <= 0:
+        raise ValueError("iter_times should be an integer greater than zero!")
+    (api_full_name, api_info_dict) = check_json(cmd_args.forward_json_path)
+    return api_full_name, api_info_dict
+
+
+def get_compare_standard(api_name):
+    if api_name in BinaryStandardApi:
+        return "CompareStandard.BINARY_EQUALITY_STANDARD"
+    if api_name in AbsoluteStandardApi:
+        return "CompareStandard.ABSOLUTE_THRESHOLD_STANDARD"
+    if api_name in ULPStandardApi:
+        return "CompareStandard.ULP_ERROR_STANDARD"
+    return "CompareStandard.BENCHMARK_STANDARD"
+
+
+def get_settings(cmd_args):
+    '''
+    internal_settings contains all information needed for the operator program.
+    keys:
+        api_full_name: api_type.api_name.ordinal_number
+        api_type: type of API, one of torch.nn.functional, torch.Tensor or torch
+        api_name: name of API
+        ordinal_number: how many times the same api has been called
+        compare_standard: comparison standard used for the API
+        direction_status: forward
+        random_seed: if mode is random_data, random seed is random_seed
+        iter_times: if mode is random_data, generate iter_times groups of data; if mode is real_data, iter_times does not matter
+        args_element_assignment: code for args assignment
+        args_list_generator_device: code for generating the args list on device
+        args_list_generator_bench: code for generating the args list on bench
+        kwargs_value_assignment: code for kwargs assignment
+        kwargs_dict_generator_device: code for generating the kwargs dict on device
+        kwargs_dict_generator_bench: code for generating the kwargs dict on bench
+    '''
+    api_full_name, api_info_dict = check_user_settings(cmd_args)
+    args_info = api_info_dict.get("args")
+    kwargs_info = api_info_dict.get("kwargs")
+
+    internal_settings = {}
+    internal_settings["api_full_name"] = api_full_name
+    (api_type, api_name, ordinal_number) = api_full_name.split(".", -1)
+    if api_type == "Functional":
+        internal_settings["api_type"] = "torch.nn.functional"
+    elif api_type == "Tensor":
+        internal_settings["api_type"] = "torch.Tensor"
+    else:
+        internal_settings["api_type"] = "torch"
+    internal_settings["api_name"] = api_name
+    internal_settings["compare_standard"] = get_compare_standard(api_name)
+    internal_settings["ordinal_number"] = ordinal_number
+    internal_settings["direction_status"] = "forward"
+    internal_settings["random_seed"] = cmd_args.random_seed
+    if cmd_args.mode == "real_data":
+        internal_settings["iter_times"] = 1
+    else:
+        internal_settings["iter_times"] = cmd_args.iter_times
+    internal_settings["args_element_assignment"] = generate_args_element_assignment_code(args_info)
+    internal_settings["args_list_generator_device"] = generate_args_list_device(args_info)
+    internal_settings["args_list_generator_bench"] = generate_args_list_bench(args_info)
+    internal_settings["kwargs_value_assignment"] = generate_kwargs_value_assignment_code(kwargs_info)
+    internal_settings["kwargs_dict_generator_device"] = generate_kwargs_dict_device(kwargs_info)
+    internal_settings["kwargs_dict_generator_bench"] = generate_kwargs_dict_bench(kwargs_info)
+    return internal_settings
+
+
+def recursive_args_element_assignment(args_info, name_number):
+    args_element_assignment = ""
+    for index, arg in enumerate(args_info):
+        if isinstance(arg, (list, tuple)):
+            new_args_element_assignment = recursive_args_element_assignment(arg, name_number + "_" + str(index))
+            args_element_assignment += new_args_element_assignment
+        else:
+            arg["parameter_name"] = "arg" + name_number + "_" + str(index)
+            args_element_assignment += "    " + "arg_info" + name_number + "_" + str(index) + " = " + "{}".format(str(arg)) + "\n"
+            args_element_assignment += "    " + "arg" + name_number + "_" + str(index) + " = " + "generate_data(arg_info" + name_number + "_" + str(index) + ")" + "\n"
+    return args_element_assignment
+
+
+def generate_args_element_assignment_code(args_info):
+    args_element_assignment = recursive_args_element_assignment(args_info, "")
+    return args_element_assignment
+
+
+def recursive_args_list(args_info, flag_device=False, flag_bench=False):
+    args_list_generator = ""
+    for index, arg in enumerate(args_info):
+        if isinstance(arg, (list, tuple)):
+            (left_bracket, right_bracket) = ("[", "]") if isinstance(arg, list) else ("(", ")")
+            args_list_generator += left_bracket
new_args_list_generator = recursive_args_list(arg, flag_device=flag_device, flag_bench=flag_bench) + args_list_generator += new_args_list_generator + args_list_generator += right_bracket + else: + args_list_generator += arg.get("parameter_name") + if arg.get("type") in TENSOR_DATA_LIST: + if flag_device: + args_list_generator += ".to(device)" + if flag_bench: + args_list_generator += '.to(torch.device("cpu"))' + args_list_generator += ".to(RAISE_PRECISION.get(str(" + arg.get("parameter_name") + ".dtype), " + arg.get("parameter_name") + ".dtype))" + args_list_generator += ", " + return args_list_generator + + +def generate_args_list_device(args_info): + args_list_generator_device = recursive_args_list(args_info, flag_device=True) + return args_list_generator_device + + +def generate_args_list_bench(args_info): + args_list_generator_bench = recursive_args_list(args_info, flag_bench=True) + return args_list_generator_bench + + +def recursive_kwargs_value_assignment(info, key_name, name_number): + kwargs_value_assignment = "" + if isinstance(info, dict): + if info.get("type") == "torch.device" or info.get("type") == "torch.dtype": + kwargs_value_assignment += " " + "kwarg_" + key_name + name_number + " = " + info.get("value") + else: + kwargs_value_assignment += " " + "kwarg_info_" + key_name + name_number + " = " + "{}".format(str(info)) + "\n" + kwargs_value_assignment += " " + "kwarg_" + key_name + name_number + " = " + "generate_data(kwarg_info_" + key_name + name_number + ")" + "\n" + info["parameter_name"] = "kwarg_" + key_name + name_number + else: + for index, arg in enumerate(info): + new_kwargs_value_assignment = recursive_kwargs_value_assignment(arg, key_name, name_number + "_" + str(index)) + kwargs_value_assignment += new_kwargs_value_assignment + return kwargs_value_assignment + + +def generate_kwargs_value_assignment_code(kwargs_info): + kwargs_value_assignment = "" + for key, value in kwargs_info.items(): + kwargs_value_assignment += recursive_kwargs_value_assignment(value, key, "") + return kwargs_value_assignment + + +def recursive_kwargs_dict(info, flag_device=False, flag_bench=False): + kwargs_dict_generator = "" + if isinstance(info, dict): + kwargs_dict_generator += info.get("parameter_name") + if info.get("type") in TENSOR_DATA_LIST: + if flag_device: + kwargs_dict_generator += ".to(device)" + if flag_bench: + kwargs_dict_generator += '.to(torch.device("cpu"))' + kwargs_dict_generator += ".to(RAISE_PRECISION.get(str(" + info.get("parameter_name") + ".dtype), " + info.get("parameter_name") + ".dtype))" + else: + (left_bracket, right_bracket) = ("[", "]") if isinstance(info, list) else ("(", ")") + kwargs_dict_generator += left_bracket + for arg in info: + kwargs_dict_generator += recursive_kwargs_dict(arg, flag_device=flag_device, flag_bench=flag_bench) + kwargs_dict_generator += ", " + kwargs_dict_generator += right_bracket + return kwargs_dict_generator + + +def generate_kwargs_dict_device(kwargs_info): + kwargs_dict_generator_device = "" + for key, value in kwargs_info.items(): + kwargs_dict_generator_device += '"' + key + '"' + ": " + kwargs_dict_generator_device += recursive_kwargs_dict(value, flag_device=True) + ", " + return kwargs_dict_generator_device + + +def generate_kwargs_dict_bench(kwargs_info): + kwargs_dict_generator_bench = "" + for key, value in kwargs_info.items(): + kwargs_dict_generator_bench += '"' + key + '"' + ": " + kwargs_dict_generator_bench += recursive_kwargs_dict(value, flag_bench=True) + ", " + return kwargs_dict_generator_bench + + +def 
op_generator_parser(parser):
+    parser.add_argument("-forward", "--forward_json_path", dest="forward_json_path", type=str,
+                        help="Path of the forward API json file.",
+                        required=True)
+    parser.add_argument("-m", "--mode", dest="mode", type=str, choices=("random_data", "real_data"),
+                        help="Execute mode, should be random_data or real_data.",
+                        required=True)
+    parser.add_argument("-rs", "--random_seed", dest="random_seed", type=int, default=1234,
+                        help="If mode is random_data, it is the random seed.",
+                        required=False)
+    parser.add_argument("-it", "--iter_times", dest="iter_times", type=int, default=5,
+                        help="If mode is random_data, generate iter_times groups of data.",
+                        required=False)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    op_generator_parser(parser)
+    cmd_args = parser.parse_args()
+    internal_settings = get_settings(cmd_args)
+
+    template_path = os.path.join(os.path.dirname(__file__), "operator_replication.template")
+    operator_script_path = os.path.join(os.path.dirname(__file__), "{0}.py".format(internal_settings.get("api_full_name")))
+
+    try:
+        with open(template_path, 'r') as ftemp, open(operator_script_path, 'w') as fout:
+            code_template = ftemp.read()
+            fout.write(code_template.format(**internal_settings))
+    except OSError:
+        print(f"Failed to open file. Please check file {template_path} or {operator_script_path}.")
+        return
+
+    print(f"Generated operator script successfully, its name is {operator_script_path}.")
+
+
+if __name__ == "__main__":
+    main()
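
For reference, a hypothetical end-to-end invocation of the generator (file names are illustrative; the forward json must contain exactly one API entry):

```bash
# generate a replication script from a dumped forward json
python op_generator.py -forward ./Functional.conv2d.0.forward.json -m random_data -rs 1234 -it 5
# the tool writes <api_full_name>.py next to op_generator.py, e.g.:
python Functional.conv2d.0.py
```
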
diff --git a/debug/accuracy_tools/api_accuracy_checker/generate_op_script/operator_replication.template b/debug/accuracy_tools/api_accuracy_checker/generate_op_script/operator_replication.template
new file mode 100644
index 0000000000000000000000000000000000000000..7630839aa937c6d0419629b5e93c34b51b71f295
--- /dev/null
+++ b/debug/accuracy_tools/api_accuracy_checker/generate_op_script/operator_replication.template
@@ -0,0 +1,325 @@
+import json
+import os
+import math
+import sys
+from enum import Enum, auto
+import torch
+try:
+    import torch_npu
+except ImportError:
+    pass
+
+
+TENSOR_DATA_LIST = ["torch.Tensor", "torch.nn.parameter.Parameter"]
+TORCH_BOOL_TYPE = ["torch.bool"]
+TORCH_INT_TYPE = ["torch.uint8", "torch.int8", "torch.int16", "torch.short", "torch.int32", "torch.int",
+                  "torch.int64", "torch.long"]
+TORCH_FLOAT_TYPE = ["torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.float",
+                    "torch.float64", "torch.double"]
+TORCH_COMPLEX_TYPE = ["torch.complex32", "torch.chalf", "torch.complex64", "torch.cfloat", "torch.complex128", "torch.cdouble"]
+RAISE_PRECISION = {{
+    "torch.float16": torch.float32,
+    "torch.half": torch.float32,
+    "torch.bfloat16": torch.float32,
+    "torch.float32": torch.float64,
+    "torch.float": torch.float64
+}}
+
+
+class CompareStandard(Enum):
+    BINARY_EQUALITY_STANDARD = auto()
+    ABSOLUTE_THRESHOLD_STANDARD = auto()
+    ULP_ERROR_STANDARD = auto()
+    BENCHMARK_STANDARD = auto()
+
+
+def get_device():
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+    elif "torch_npu" in sys.modules and torch_npu.npu.is_available():
+        device = torch.device("npu")
+    else:
+        raise Exception("Error: no NPU or GPU device is available!")
+    return device
+
+
+def generate_bool_tensor(low, high, shape):
+    low, high = int(low), int(high)
+    tensor = torch.randint(low, high + 1, shape)
+    bool_tensor = torch.gt(tensor, 0)
+    return bool_tensor
+
+
+def generate_numerical_tensor(low, high, shape, data_dtype):
+    if data_dtype in TORCH_FLOAT_TYPE:
+        scale = high - low
+        rand01 = torch.rand(shape, dtype=eval(data_dtype))
+        tensor = rand01 * scale + low
+    elif data_dtype in TORCH_INT_TYPE:
+        low, high = int(low), int(high)
+        tensor = torch.randint(low, high + 1, shape, dtype=eval(data_dtype))
+    else:
+        raise NotImplementedError(f"{{data_dtype}} is not supported!")
+    if torch.numel(tensor) == 0:
+        return tensor
+    tmp_tensor = tensor.reshape(-1)
+    tmp_tensor[0] = low
+    tmp_tensor[-1] = high
+    data = tmp_tensor.reshape(shape)
+    return data
+
+
+def generate_random_tensor(info):
+    low, high = info.get('Min'), info.get('Max')
+    data_dtype = info.get('dtype')
+    shape = tuple(info.get('shape'))
+    if data_dtype == "torch.bool":
+        data = generate_bool_tensor(low, high, shape)
+    else:
+        data = generate_numerical_tensor(low, high, shape, data_dtype)
+    return data
+
+
+def generate_real_tensor(data_path):
+    data_path = os.path.realpath(data_path)
+    data = torch.load(data_path)
+    return data
+
+
+def generate_data(info):
+    data_type = info.get("type")
+    data_path = info.get("datapath")
+    if data_type in TENSOR_DATA_LIST:
+        if data_path:
+            data = generate_real_tensor(data_path)
+        else:
+            data = generate_random_tensor(info)
+    else:
+        data = info.get("value")
+    return data
+
+
+def get_input():
+{args_element_assignment}
+    args_device = [{args_list_generator_device}]
+    args_bench = [{args_list_generator_bench}]
+{kwargs_value_assignment}
+    kwargs_device = {{{kwargs_dict_generator_device}}}
+    kwargs_bench = {{{kwargs_dict_generator_bench}}}
+    return args_device, kwargs_device, args_bench, kwargs_bench
+
+
+def exec_api_device(args, kwargs):
+    output_device = {api_type}.{api_name}(*args, **kwargs)
+    return output_device
+
+
+def exec_api_bench(args, kwargs):
+    output_bench = {api_type}.{api_name}(*args, **kwargs)
+    return output_bench
+
+
+def compute_inf_nan_proportion(inf_nan_mask, out_device, out_bench, abs_bench_with_eps, rtol):
+    out_bench = out_bench.to(out_device.dtype)
+    # finite range of the device dtype
+    dtype_min = torch.finfo(out_device.dtype).min
+    dtype_max = torch.finfo(out_device.dtype).max
+    bench_clip = torch.clamp(out_bench, min=dtype_min, max=dtype_max)
+    device_clip = torch.clamp(out_device, min=dtype_min, max=dtype_max)
+    clipped_abs_ae = torch.abs(device_clip - bench_clip)
+    clipped_re = clipped_abs_ae / abs_bench_with_eps
+    pass_mask = torch.less_equal(clipped_re, rtol)
+    both_nan_mask = torch.logical_and(torch.isnan(out_device), torch.isnan(bench_clip))
+    pass_mask = torch.logical_or(pass_mask, both_nan_mask)
+    not_pass_mask = torch.logical_not(pass_mask)
+    not_pass_mask = torch.logical_and(not_pass_mask, inf_nan_mask)
+    inf_nan_err_cnt = torch.sum(not_pass_mask)
+    return 0 if torch.sum(inf_nan_mask) == 0 else inf_nan_err_cnt / torch.sum(inf_nan_mask)
+
+
+def compute_rmse(abs_err, normal_value_mask):
+    if torch.sum(normal_value_mask) == 0:
+        return 0
+    else:
+        masked_ae = torch.where(normal_value_mask, abs_err, 0)
+        mse = torch.sum(torch.square(masked_ae)) / torch.sum(normal_value_mask)
+        rmse = torch.sqrt(mse)
+        return rmse
+
+
+def compute_error_balance(out_device, out_bench):
+    larger_count = torch.sum(torch.greater(out_device - out_bench.to(out_device.dtype), 0))
+    smaller_count = torch.sum(torch.less(out_device - out_bench.to(out_device.dtype), 0))
+    total_count = torch.numel(out_bench)
+    error_balance = abs(larger_count - smaller_count) / total_count
+    return error_balance
+
+
+def compare_tensor(out_device, out_bench, api_name):
+    if out_device.shape != out_bench.shape:
+        print("ERROR: shape of out_device and out_bench is not equal!")
+        return None
+    if torch.numel(out_bench) == 0:
+        print("Both out_device and out_bench have zero elements.")
+        return None
+    print(f"shape is
{{out_bench.shape}}") + print(f"dtype of out_device is {{out_device.dtype}}") + print(f"dtype of out_bench is {{out_bench.dtype}}") + dtype_device = out_device.dtype + dtype_bench = out_bench.dtype + if str(dtype_device) in TORCH_FLOAT_TYPE and str(dtype_bench) in TORCH_FLOAT_TYPE \ + or str(dtype_device) in TORCH_INT_TYPE and str(dtype_bench) in TORCH_INT_TYPE \ + or str(dtype_device) in TORCH_BOOL_TYPE and str(dtype_bench) in TORCH_BOOL_TYPE: + out_device = out_device.to(torch.device("cpu")) + if str(dtype_device) in TORCH_BOOL_TYPE or str(dtype_device) in TORCH_INT_TYPE or compare_standard == CompareStandard.BINARY_EQUALITY_STANDARD: + print("compare standard: binary equality standard:") + error_number = torch.sum(out_device != out_bench).item() + error_rate = error_number / torch.numel(out_bench) + print(f"error rate is {{error_rate}}.") + else: + abs_err = torch.abs(out_device - out_bench) + abs_bench = torch.abs(out_bench) + if dtype_bench == torch.float32: + eps = 2 ** -23 + if dtype_bench == torch.float64: + eps = 2 ** -52 + abs_bench_with_eps = abs_bench + eps + rel_err = torch.abs(abs_err / abs_bench_with_eps) + device_finite_mask = torch.isfinite(out_device) + bench_finite_mask = torch.isfinite(out_bench.to(dtype_device)) + both_finite_mask = torch.logical_and(device_finite_mask, bench_finite_mask) + inf_nan_mask = torch.logical_not(both_finite_mask) + if compare_standard == CompareStandard.ABSOLUTE_THRESHOLD_STANDARD: + if dtype_device == torch.float16: + rtol, small_value, small_value_atol = 1.0e-3, 1.0e-3, 1.0e-5 + elif dtype_device == torch.bfloat16: + rtol, small_value, small_value_atol = 4.0e-3, 1.0e-3, 1.0e-5 + else: + rtol, small_value, small_value_atol = 1.0e-6, 1.0e-6, 1.0e-9 + small_value_mask = torch.less_equal(abs_bench, small_value) + small_value_mask = torch.logical_and(small_value_mask, both_finite_mask) + normal_value_mask = torch.logical_and(both_finite_mask, torch.logical_not(small_value_mask)) + inf_nan_proportion = compute_inf_nan_proportion(inf_nan_mask, out_device, out_bench, abs_bench_with_eps, rtol) + rel_err_mask = torch.greater(rel_err, rtol) + rel_err_mask = torch.logical_and(rel_err_mask, normal_value_mask) + if torch.sum(normal_value_mask) == 0: + rel_err_proportion = 0 + else: + rel_err_proportion = torch.sum(rel_err_mask) / torch.sum(normal_value_mask) + abs_err_mask = torch.greater(abs_err, small_value_atol) + abs_err_mask = torch.logical_and(abs_err_mask, small_value_mask) + if torch.sum(small_value_mask) == 0: + abs_err_proportion = 0 + else: + abs_err_proportion = torch.sum(abs_err_mask) / torch.sum(small_value_mask) + print("compare standard: absolute threshold standard") + print(f"relative error ratio is {{rel_err_proportion}}") + print(f"absolute error ratio is {{abs_err_proportion}}") + elif compare_standard == CompareStandard.ULP_ERROR_STANDARD: + if dtype_device == torch.float16: + min_eb, exponent_num = -14, 10 + elif dtype_device == torch.bfloat16: + min_eb, exponent_num = -126, 7 + else: + min_eb, exponent_num = -126, 23 + eb = torch.where(abs_bench == 0, torch.zeros(out_bench.shape), torch.floor(torch.log2(abs_bench))) + eb = torch.maximum(eb, min_eb * torch.ones(out_bench.shape)) + if dtype_device == torch.float32: + ulp_err = (out_device.to(torch.float64) - out_bench).to(torch.float64) * torch.exp2(-eb + exponent_num).to(torch.float64) + else: + ulp_err = (out_device.to(torch.float32) - out_bench).to(torch.float32) * torch.exp2(-eb + exponent_num).to(torch.float32) + ulp_err = torch.abs(ulp_err) + max_ulp_err = torch.max(ulp_err) + 
mean_ulp_err = torch.mean(ulp_err) + if dtype_device == torch.float32: + ulp_err_proportion = torch.sum(ulp_err > 32) / torch.numel(out_bench) + else: + ulp_err_proportion = torch.sum(ulp_err > 1) / torch.numel(out_bench) + print("compare standard: ulp error standard") + print(f"maximum ulp error is {{max_ulp_err}}") + print(f"mean ulp error is {{mean_ulp_err}}") + print(f"ulp error proportion is {{ulp_err_proportion}}") + else: + if dtype_device == torch.float16: + small_value, small_value_atol = 1.0e-3, 1.0e-5 + elif dtype_device == torch.bfloat16: + small_value, small_value_atol = 1.0e-3, 1.0e-5 + else: + small_value, small_value_atol = 1.0e-6, 1.0e-9 + small_value_mask = torch.less_equal(abs_bench, small_value) + small_value_mask = torch.logical_and(small_value_mask, both_finite_mask) + normal_value_mask = torch.logical_and(both_finite_mask, torch.logical_not(small_value_mask)) + abs_err_mask = torch.greater(abs_err, small_value_atol) + abs_err_mask = torch.logical_and(abs_err_mask, small_value_mask) + if torch.sum(small_value_mask) == 0: + small_value_err_proportion = 0 + else: + small_value_err_proportion = torch.sum(abs_err_mask) / torch.sum(small_value_mask) + rel_err = torch.where(normal_value_mask, rel_err, -1 * torch.ones(out_device.shape)) + if torch.max(rel_err) >= 0: + max_rel_err = torch.max(rel_err) + else: + max_rel_err = 0 + if torch.sum(normal_value_mask) == 0: + mean_rel_err = 0 + else: + mean_rel_err = torch.sum(torch.clamp(rel_err, min=0)) / torch.sum(normal_value_mask) + rmse = compute_rmse(abs_err, normal_value_mask) + error_balance = compute_error_balance(out_device, out_bench) + print("compare standard: benchmark standard") + print(f"small value error proportion is {{small_value_err_proportion}}") + print(f"maximum relative error is {{max_rel_err}}") + print(f"mean relative error is {{mean_rel_err}}") + print(f"root mean squared error is {{rmse}}") + print(f"error balance is {{error_balance}}") + else: + print(f"ERROR: out_device dtype is {{dtype_device}}, out_bench dtype is {{dtype_bench}}, not comparable.") + return None + + +def compare_element(out_device, out_bench, api_name): + if type(out_device) != type(out_bench): + print("ERROR: out_device and out_bench is not the same type!") + return None + if isinstance(out_bench, torch.Tensor): + print(f"data type: {{type(out_bench)}}") + compare_tensor(out_device, out_bench, api_name) + elif isinstance(out_bench, (bool, int, float, str)): + print(f"data type: {{type(out_bench)}}") + if out_device == out_bench: + print("PASS: out_device and out_bench equals.") + else: + print("ERROR: out_device and out_bench is not equal!") + else: + print(f"ERROR: comparison of type {{type(out_bench)}} is not supported.") + return None + + +def compare(out_device, out_bench, api_name): + print("Compare result:") + if type(out_device) != type(out_bench): + print("ERROR: out_device and out_bench is not the same type!") + print("Compare finished.") + return None + if isinstance(out_bench, (list, tuple)): + print(f"data type: {{type(out_bench)}}") + if len(out_device) != len(out_bench): + print("ERROR: len of out_device and out_bench is different!") + print("Compare finished.") + return None + for index, _ in enumerate(out_bench): + print(f"index {{index}}:") + compare_element(out_device[index], out_bench[index], api_name) + else: + compare_element(out_device, out_bench, api_name) + print("Compare finished.") + + +device = get_device() +api_name = "{api_name}" +compare_standard = {compare_standard} +torch.manual_seed({random_seed}) +for 
i in range({iter_times}): + print(f"iter: {{i}}:") + args_device, kwargs_device, args_bench, kwargs_bench = get_input() + output_device = exec_api_device(args_device, kwargs_device) + output_bench = exec_api_bench(args_bench, kwargs_bench) + compare(output_device, output_bench, api_name) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py deleted file mode 100644 index cbd7b6daa2f0d9b0a9b28016993e836ee07df72d..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ /dev/null @@ -1,86 +0,0 @@ -import statistics as st -from abc import ABC -from typing import List -import sys -from torch.utils.tensorboard import SummaryWriter -from collections import defaultdict -from kj600.utils import print_info_log - -class ScanRule(ABC): - def apply(self, history, cur): - raise NotImplementedError("abstract method apply is not implemented") - -class AnomalyTurbulence(ScanRule): - name = "AnomalyTurbulence" - def __init__(self, threshold) -> None: - self.threshold = threshold - def apply(self, history, cur): - baseline = st.mean(history) if isinstance(history, list) else history - - up_bound = baseline + baseline * self.threshold - if baseline > 0: - return cur > up_bound - else: - return cur < up_bound - -class AnomalyScanner: - - @staticmethod - def load_rules(specs: List[dict]): - if specs is None: - return [] - alert_rules = [] - for spec in specs: - rule_cls_name = spec["rule_name"] - rule_args = spec["args"] - cur_module = sys.modules[__name__] - rule_cls = getattr(cur_module, rule_cls_name) - rule_instance = rule_cls(**rule_args) - alert_rules.append(rule_instance) - return alert_rules - - @staticmethod - def scan(scan_rules: List[ScanRule], history, cur): - anomaly = False - for rule in scan_rules: - anomaly = rule.apply(history, cur) - if anomaly: - return anomaly, rule.name - return anomaly, None - -class bcolors: - HEADER = '\033[95m' - OKBLUE = '\033[94m' - OKCYAN = '\033[96m' - OKGREEN = '\033[92m' - WARNING = '\033[93m' - FAIL = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - -class SummaryWriterWithAD(SummaryWriter): - def __init__(self, path, ad_rules, job_id, anomaly_inform=False): - super().__init__(path) - self.tag2scalars = defaultdict(list) - self.ad_rules = ad_rules - self.job_id = job_id - self.anomaly_inform = anomaly_inform - - def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): - new_avg = avg = scalar_value - if tag in self.tag2scalars: - N = len(self.tag2scalars[tag]) - _, avg = self.tag2scalars[tag][-1] - new_avg = (avg*N + scalar_value)/(N + 1) - self.tag2scalars[tag].append((scalar_value, new_avg)) - detected, rule_name = self._ad(scalar_value, history=avg) - if detected: - print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") - exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" - if self.anomaly_inform: - self.anomaly_inform.run(exception_message, self.job_id) - return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) - - def _ad(self, scalar_value, history): - return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py deleted file mode 100644 index 
285f17ca6dc6a00814b0847c7d203524d8a8caa6..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ /dev/null @@ -1,90 +0,0 @@ -from collections import defaultdict -import torch -import torch.distributed as dist -from kj600.visualizer import HeatmapVisualizer - - -def print_rank_0(message, debug=False, force=False): - if dist.is_initialized(): - if dist.get_rank() == 0: - print(message) - else: - print(message) - - -class MixPrecsionOptimizerMon: - wrapped_optimizer = None - - def __init__(self) -> None: - self.fp16_to_fp32_param = {} - - @staticmethod - def set_wrapped_optimizer(_wrapped_optimizer): - MixPrecsionOptimizerMon.wrapped_optimizer = _wrapped_optimizer - - # parameter tensors we want to monitor and their names are in params2name_dict - # base_optimizer is pytorch optimizer, wrapped_optimizer is a normal object with base_optimizer - def fetch_mv(self, monitor, torch_opt, params2name): - mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer - - if not self.fp16_to_fp32_param and mix_prec_opt is not None: - for fp16_group, fp32_group in zip(mix_prec_opt.float16_groups, mix_prec_opt.fp32_from_float16_groups): - for fp16_param, fp32_param in zip(fp16_group, fp32_group): - self.fp16_to_fp32_param[fp16_param] = fp32_param - return self._fetch_mv_in_adam(params2name, torch_opt, monitor) - - def _fetch_mv_in_adam(self, params2name, torch_opt, monitor): - exp_avg_dict = defaultdict(float) - exp_avg_sq_dict = defaultdict(float) - update_dict = defaultdict() - ratio_dict = defaultdict() - - for param, name in params2name.items(): - if param in self.fp16_to_fp32_param: - param = self.fp16_to_fp32_param[param] - - if param in torch_opt.state: - exp_avg = torch_opt.state[param]["exp_avg"] - exp_avg_sq = torch_opt.state[param]["exp_avg_sq"] - if monitor.mv_distribution: - exp_avg_dict[name] = exp_avg - exp_avg_sq_dict[name] = exp_avg_sq - if monitor.mg_direction: - exp_avg_dict[name] = exp_avg - if monitor.ur_distribution: - update_dict[name] = exp_avg / (torch.sqrt(exp_avg_sq) + torch_opt.defaults['eps']) - ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) - monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) - monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) - return exp_avg_dict, exp_avg_sq_dict, update_dict, ratio_dict - - -class MegatronDistributedOptimizerMon(MixPrecsionOptimizerMon): - def fetch_mv(self, monitor, torch_opt, params2name): - mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer - if not (hasattr(mix_prec_opt, "model_float16_groups") and hasattr(mix_prec_opt, "shard_fp32_from_float16_groups")): - raise Exception("megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, \ - if not, please check megatron-lm version") - if not self.fp16_to_fp32_param and mix_prec_opt is not None: - for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, mix_prec_opt.shard_fp32_from_float16_groups): - for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group): - self.fp16_to_fp32_param[fp16_param] = shard_fp32_param - - return self._fetch_mv_in_adam(params2name, torch_opt, monitor) - - -class DummyOptimizerMon(MixPrecsionOptimizerMon): - def fetch_mv(self, monitor, torch_opt, params2name): - return None, None, None, None - - -class OptimizerMonFactory: - @staticmethod - def create_optimizer_mon(opt_ty:str): - if opt_ty == "Megatron_Float16OptimizerWithFloat16Params": - return MixPrecsionOptimizerMon() - if opt_ty == 
"Megatron_DistributedOptimizer": - return MegatronDistributedOptimizerMon() - if opt_ty is None or opt_ty == "unknown": - return DummyOptimizerMon() - raise Exception("opt_ty should be Megatron_Float16OptimizerWithFloat16Params or Megatron_DistributedOptimizer or None or unknown") diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/monitor/README.md similarity index 49% rename from debug/accuracy_tools/kj600/README.md rename to debug/accuracy_tools/monitor/README.md index 1782e58bec0404092bb8c6784a8235c68f536ac9..9882bfbfd3533cb8b9b211857c3f75db5e448aff 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/monitor/README.md @@ -1,4 +1,4 @@ -# TensorProbe (codename:kj600) 模型训练状态监控工具 +# Monitor 模型训练状态监控工具 ## 简介 @@ -10,37 +10,173 @@ | 依赖软件 | |-------------| -| torch | +| torch>=2.0 | | torch_npu | | torchvision | | tensorboard | | matplotlib | -| sqlalchemy | -| pymysql | -### 2. 安装 kj600 +### 2. 安装 monitor -方式一:从 git 直接安装 +方式一:下载源码安装 +```bash +git clone -b pre-research https://gitee.com/ascend/mstt.git +cd mstt/debug/accuracy_tools/monitor +pip install . ``` -pip install git+https://gitee.com/xiangsen2/kj600.git + +## 快速上手 +根据需求监控相应对象。比如在loss上扬,grad norm正常的异常训练过程中,优先考虑监控模型前向过程;在grad norm异常的训练过程中,监控权重和激活值的梯度。 +推荐使用方式:权重梯度的监控性能损耗小(20B模型全量权重梯度监控,时间增加<1%,内存增加<1%),可以长期开启。激活值监控性能损耗大(时间增加40%),在必要时开启或者监控部分激活值。 + +### 工具使能 +在训练脚本中使能工具(Megatron-LM),在配置文件(json)中控制工具行为。 +```python +from monitor.module_hook import TrainerMon +hooker = TrainerMon("./monitor_config.json", process_group=None, params_have_main_grad=True) # 在配置文件控制工具行为 + +model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + model_provider, model_type) +# 模型、优化器初始化后使能工具 + +hooker.monitor_gnorm_with_ad( + model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size, optimizer=optimizer, dp_group=mpu.get_data_parallel_group(), tp_group=mpu.get_tensor_model_parallel_group()) + +# 可以在任意位置获取当前的梯度统计量, 不同调用位置不能保证reduce已完成 +reduced, unreduced = hooker.generate_wgrad_metrics() ``` -方式二:下载源码安装 +| 字段名 | 是否必选 | 解释 | +| ------------------------------------------------------------ | -------- | -------- | +|"grad_acc_steps"| 必选 |梯度累积的步数,当micro step=grad acc steps时,会触发反向hook获取模型梯度| +|"optimizer"| 可选 |各种并行域reduce后的梯度在opt.step前获取,数据写入在step后进行。默认patch pytorch的优化器,传入其他优化器(如MegatronOptimizer)可以调整工具行为,如clip_grad发生在megatron的优化器中,pytorch的优化器之前。| +|"dp_group"| 可选 |训练过程中的dp_group。dp域通信后,group内所有rank的梯度相同,落盘数据冗余。提供dp_group后,工具仅保留每个dp_group的第一个rank的梯度| +|"tp_group"| 可选 |训练过程中的tp_group。tp域通信后,group内部分参数所有rank的梯度相同,落盘数据冗余。提供tp_group后,工具仅保留每个tp_group中冗余参数在第一个rank的梯度。当前适配Megatron core_v0.6.0, 通过权重属性`tensor_model_parallel`判断是否冗余| + +### 权重梯度监控 +- 工具配置: +```json +{ + "targets": { + "module": {}, + "module.module.language_model.encoder.layers.0": {} + }, + "print_struct": false, # 若不了解模型结构,可以打开print_struct打印模型结构 + "wg_distribution": true, + "format": "csv", # 如果不需要落盘文件,设置为 "api" + "ops": ["norm", "min", "max", "mean", "nans"] +} +``` +`targets`中指定module包含的所有权重都会被监控。如`module`可以覆盖全量梯度。 +- 结果验证 +训练日志中通常会打屏一个训练步的grad norm。提供了脚本校验落盘csv数据和打屏信息的一致性。 +```bash +python monitor/unittest/test_monitor.py -m monitor_output/Aug13_02-27-5 -l logs/train_gpt3_TP2_PP1_CP1_monitor.log -d 2 -t 2 ``` -git clone https://gitee.com/xiangsen2/kj600.git -cd kj600 -pip install . 
+`-m`指定落盘csv的路径前缀。`-l`指定训练日志。脚本通过关键词`grad norm: `匹配训练日志中的grad norm,根据实际情况修改。从落盘数据计算的grad norm和日志中的grad norm相对偏差超过1%,会有警告。`-d`、`--dp_size`声明data parallel size,`-t`、`--tp_size`声明tensor paralllel size。 +示例输出: +```txt +rank 2 is duplicated in dp group +rank 3 is duplicated in dp group +grad norm in consiste between training log and reduced gradients monitored +grad mean is in consisten between unreduced grad and reduced grad monitord. +``` +需要提供并行相关参数,具体参见: +```bash +python monitor/unittest/test_monitor.py -h ``` -# 快速上手 +### 激活值监控 +- 工具配置 +监控指定module的输入输出。主要需配置target字段,格式为`{:{:}}`。`field`可以是["input","output"], format表示激活值的类型,常见且工具支持的有[`tensor`,`tuple`]。当module激活值为`tuple`时,需要指定tuple中的index进行监控。如`tuple[2]:0`表示该激活值是长度为2的tuple,需要监控第0个元素(缺省为0)。 +```json +{ + "targets": { + "module.module.language_model.encoder.layers.0": { + "input": "tuple[2]", + "output": "tensor" + } + }, + "print_struct": false, # 若不了解模型结构,可以打开print_struct打印模型结构 + "xy_distribution": true, + "forward_only": true, + "all_xy": true, # 需要监控全量激活值时,一个个填写targets较为繁琐,可以设置该字段使能。 + "format": "csv", # 如果不需要落盘文件,设置为 "api" + "ops": ["norm", "min", "max", "mean"] +} +``` - 下面以Ascend/ModelLink训练框架为例,给出kj600工具的使用方法。 +```json -1. 在ModelLink的根目录,创建json配置文件,如llama2_config.json,内容如下: +``` + +### 功能重载 +- 统计量 +可以在训练过程中修改`TrainerMon`实例的`ops`属性, 调整监控的统计量。 +```python +if {some condition}: + hooker.ops = ["min", "max"] +``` +- 训练过程中开关激活值监控 +激活值监控的性能损耗较大, 推荐在必要时开启, 比如发现loss出现尖刺, 根据loss的异常开启激活值监控. +```python +if {some condition}: + hooker.reload_xy(xy_distribution=True) ``` + +### 落盘路径 +通过json文件中的`format`字段控制格式,通过环境变量`MONITOR_OUTPUT_DIR`指定路径。 +```bash +export MONITOR_OUTPUT_DIR=/xxx/output_dir +``` +- format: tensorboard(default) / csv +监控结果写入tensorboard的event文件/csv中,设置输出路径(默认为`monitor_output`,通过环境变量配置) + +- format: api +监控结果不落盘,在训练过程中可以通过`generate_wgrad_metrics`和`generate_xy_metrics`接口获取。 + + +### 梯度异常时序判断 +0. 训练前配置相关参数 +工具支持自动判断训练过程中的梯度异常,需要在配置文件中设置alert相关字段。`AnomalyTurbulence`会将当前数值与历史均值比较,如果相对偏差超过阈值,会在打屏信息中提示用户。如果打开`dump`选项,则会将异常梯度相关信息落盘,用于后续时序判断。 +```json + "alert": { + "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}], + "dump": true + }, +``` +1. 实例化工具时传入流水线并行group +```python +hooker = TrainerMon("./monitor_config.json", process_group=mpu.get_pipeline_model_parallel_group(), params_have_main_grad=True) +``` +照常开始训练 + +2. 进入工具路径启动异常分析脚本: +```shell +cd monitor/ +python3 anomaly_analyse.py -d $MONITOR_OUTPUT_DIR/anomaly_detected +``` +支持以下参数配置 +| 字段名 | 解释 | 是否必选释 | +| ------ | -------- | -------- | +|-d 或 --data_path| 指定梯度异常落盘文件夹,梯度监控功能输出,一般为$MONITOR_OUTPUT_DIR/anomaly_detected。|是 | +|-o 或 --out_path| 排序后的异常落盘文件地址,默认在--data_path路径下落盘一个anomaly_analyse.json文件| 否 | +|-k 或 --topk| 指定保留前topk个异常,默认为8| 否 | +|-s 或 --step_list| 指定分析的step范围,默认为[]| 否 | + +## 已知问题 +- backward hook与pp中的deallocate_output_tensor冲突. backward hook创建了output tensor的view copy, 无法再对output tensor做inplace修改. (已解决) + +## 详细配置 + + 下面以Ascend/ModelLink训练框架为例,给出monitor工具的使用方法。 + +1. 在ModelLink的根目录,创建json配置文件,如llama2_config.json,内容如下: + +```json { "targets": { "language_model.encoder.layers.0": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"} @@ -54,9 +190,10 @@ pip install . 
"cc_distribution": {"enable":true, "cc_codeline":[]}, "alert": { "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}], - "inform": {"recipient": "database", "connection_str": "mysql+pymysql://username:password@host:port/database"} + "dump": true }, - "ops": ["min", "max", "norm", "zeros", "id"], + "format": "tensorboard", + "ops": ["min", "max", "norm", "zeros", "id", "nans"], "eps": 1e-8 } ``` @@ -78,9 +215,10 @@ pip install . |"xy_distribution"| 可选 | 若为true则会监控指定module(targets中指定)的输入输出张量。 默认为false。| |"mv_distribution"| 可选 | 若为true则会监控指定模块中的参数的优化器状态, 默认为false。需要在TrainerMon构造函数正确指定opt_ty. 目前只支持megatron的混合精度优化器以及megatron的分布式优化器。 Deepspeed的分布式优化器实现暂不支持。 | |"wg_distribution"| 可选 | 若为true则会监控指定模块的参数梯度, 默认为false。 | -|"alert"| 必选 | · "rules": 指定自动报警的异常检测机制及其相应的阈值。目前实现的异常检测是AnomalyTurbulence。 如果统计标量超出历史均值的指定浮动范围(threshold指定, 0.5意味着上浮或者下浮50%)则在控制台打印报警信息。
· "inform": 自动报警需要的配置,若想关闭自动报警删掉inform的配置即可。其中"recipient"指定自动报警的通知方式,可选值为"database"或"email",默认为"database"。
- 若"recipient"为"database",则需要指定"connection_str"字段,即数据库的连接URL,默认为{"recipient":"database", "connection_str": "mysql+pymysql://username:password@host:port/database"},若有特殊字符需要转义。
- 若"recipient"为"email",则需要指定"send_email_address"-发送方邮箱地址,"receive_email_address"-接收方邮箱地址,"send_email_username"-发送方邮箱用户名,"send_email_password"-发送方邮箱密码,"smtp_server"-发送方邮箱对应的SMTP服务器,"smtp_port"-发送方邮箱对应的SMTP端口号。默认为:
{"recipient":"email", send_email_address": "sender@huawei.com", "receive_email_address": "receiver@huawei.com", "send_email_username": "username", "send_email_password": "******", "smtp_server": "smtpscn.huawei.com", "smtp_port": "587"}| +|"alert"| 必选 | "rules": 指定自动报警的异常检测机制及其相应的阈值。目前实现的异常检测是AnomalyTurbulence。 如果统计标量超出历史均值的指定浮动范围(threshold指定, 0.5意味着上浮或者下浮50%)则在控制台打印报警信息。
| |"cc_distribution"| 可选 | 其中"enable"字段控制通信监控模块的开关;需要监控通信算子时,务必尽量早地实例化`TrainerMon`, 因为监控通过劫持原始func后挂hook实现,部分加速库初始化时会保存原始function,避免监控失效。"cc_codeline"字段指定监控的代码行,如:`train.py\\[23\\]`,默认为空列表,不特别指定;"cc_pre_hook"字段控制是否监控通信前的数据; 模块会在第二个optimize.step之前打印通信日志,包括通信api的调用栈、输入dtype、通信group。 "cc_log_only"为true时,仅打印日志,不监控通信的输入输出,并在打印后中断训练。可以根据通信日志设置"cc_codeline",规避与训练过程不相关的通信,比如一些时间、metrics的同步。| -|"ops"| 可选 |与ur_distribution、xy_distribution、mv_distribution、wg_distribution、mg_direction、cc_distribution配合,监控所选张量的min、max、norm、zeros值。其中,zeros代表监控所选张量的元素小于eps的比例,id代表监控所选的非张量本身,默认为[]。| +|"format"| 可选 | 数据落盘格式,默认为tensorboard,支持可选 "csv"。 | +|"ops"| 可选 |与ur_distribution、xy_distribution、mv_distribution、wg_distribution、mg_direction、cc_distribution配合,监控所选张量的min、max、norm、zeros、nans值。其中,zeros代表监控所选张量的元素小于eps的比例,id代表监控所选的非张量本身,默认为[]。| |"eps"| 可选 |若ops里包含"zeros"则需要配置,默认为1e-8。| 下面给出transformer架构模型中常见的module的前向计算的输入输出和反向计算输入张量的梯度和输出张量的梯度格式,以供参考: @@ -101,7 +239,7 @@ pip install . 对于language_model.embedding.word_embeddings这类输入层,我们不关心输入的情况下,可以不填"input"和"input_grad",监控的状态中不会包含输入的相关信息。config文件示例如下: -``` +```json { "targets": { "language_model.embedding.word_embeddings": {"output": "tensor","output_grad":"tuple[1]:0"} @@ -109,57 +247,57 @@ pip install . } ``` -2. 在训练器中加入代码,开启kj600训练监控。 +2. 在训练器中加入代码,开启monitor训练监控。 例如在ModelLink/pretrain_gpt.py的model_provider GPTModel构造后加入以下代码, **注意优化器类型opt_ty** : - ``` - from kj600.module_hook import TrainerMon - hooker = TrainerMon("./llama2_config.json", params_have_main_grad=True, opt_ty="Megatron_DistributedOptimizer") # or opt_ty=Megatron_Float16OptimizerWithFloat16Params + ```python + from monitor.module_hook import TrainerMon + hooker = TrainerMon("./llama2_config.json", process_group=None, params_have_main_grad=True, opt_ty="Megatron_DistributedOptimizer") # or opt_ty=Megatron_Float16OptimizerWithFloat16Params hooker.hook_modules(model=model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size) ``` params_have_main_grad: 若为True则参数权重梯度为main_grad,否则为grad,默认为True。 如果不是Megatron-LM的训练框架, 可以设置对应的梯度累积步数grad_acc_steps。 - 如果要监控混合精度优化器的动量和方差, 需要在混合精度优化器构造后加入如下代码。 目前只支持Megatron_DistributedOptimizer, 使用bf16或者fp16混合精度时开启分布式优化器。 或者Megatron_Float16OptimizerWithFloat16Params, 使用bf16或者fp16混合精度选项并且不开启分布式优化器。 + 如果要监控优化器的动量和方差,需要在优化器构造后加入如下代码。 目前支持Megatron实现的优化器: + - Megatron_FP32OptimizerMon,普通优化器。 + - Megatron_Float16OptimizerWithFloat16Params, 使用bf16或者fp16混合精度选项并且不开启分布式优化器。 + - Megatron_DistributedOptimizer, 使用bf16或者fp16混合精度时开启分布式优化器。 - ``` + ```python model, optimizer, opt_param_scheduler = setup_model_and_optimizer( model_provider, model_type) # 插入位置 - from kj600.module_hook import TrainerMon + from monitor.module_hook import TrainerMon TrainerMon.set_wrapped_optimizer(optimizer) ``` 3. 配置tensorboard写入的目录 - ``` - export KJ600_OUTPUT_DIR=/xxx/output_dir + ```shell + export MONITOR_OUTPUT_DIR=/xxx/output_dir ``` 4. 开始预训练,在日志中如果发现以下内容, 则说明指定的模块被成功监视。 - ``` + ```txt > language_model.encoder.layers.0 is monitored successfully > 1 out of 1 modules are monitored. ``` 5. 
训练过程中,打开tensorboard,可以查看训练的中间状态: -``` -tensorboard --logdir=$KJ600_OUTPUT_DIR +```shell +tensorboard --logdir=$MONITOR_OUTPUT_DIR ``` 之后,运行以下SSH命令来建立端口转发,可以在本地通过http://localhost:6006访问tensorboard: -``` +```shell ssh -N -L localhost:6006:localhost:6006 your_username@remote_server_address ``` -# 高级用法 -TBD - # 公开接口 **接口说明** @@ -171,6 +309,7 @@ TrainerMon.__init__(config_file_path, params_have_main_grad=True, opt_ty=None) - | 参数 | 说明 | 是否必选 | | ----- | -------------------- | -------- | | config_file_path |自己写的json配置文件路径。 | 是 | +| process_group | 传入ProcessGroup对象,用以确定pipeline并行不同rank异常间时序,megatron下通过core.parallel_state.get_pipeline_model_parallel_group()获得 | 否 | | params_have_main_grad |权重是否使用main_grad,是就为True,否则为False。默认为True。 | 否 | | opt_ty |优化器类型,有两个选项,Megatron_DistributedOptimizer:使用bf16或者fp16混合精度时开启分布式优化器;Megatron_Float16OptimizerWithFloat16Params:使用bf16或者fp16混合精度选项并且不开启分布式优化器,也适用于常规的adam优化器。如果使用的不是adam优化器,使用None。默认为None。 | 否 | diff --git a/debug/accuracy_tools/kj600/img/cpu_info.png b/debug/accuracy_tools/monitor/img/cpu_info.png similarity index 100% rename from debug/accuracy_tools/kj600/img/cpu_info.png rename to debug/accuracy_tools/monitor/img/cpu_info.png diff --git a/debug/accuracy_tools/kj600/img/train.png b/debug/accuracy_tools/monitor/img/train.png similarity index 100% rename from debug/accuracy_tools/kj600/img/train.png rename to debug/accuracy_tools/monitor/img/train.png diff --git a/debug/accuracy_tools/kj600/img/train_with_kj600.png b/debug/accuracy_tools/monitor/img/train_with_kj600.png similarity index 100% rename from debug/accuracy_tools/kj600/img/train_with_kj600.png rename to debug/accuracy_tools/monitor/img/train_with_kj600.png diff --git a/debug/accuracy_tools/kj600/kj600/__init__.py b/debug/accuracy_tools/monitor/monitor/__init__.py similarity index 100% rename from debug/accuracy_tools/kj600/kj600/__init__.py rename to debug/accuracy_tools/monitor/monitor/__init__.py diff --git a/debug/accuracy_tools/monitor/monitor/anomaly_analyse.py b/debug/accuracy_tools/monitor/monitor/anomaly_analyse.py new file mode 100644 index 0000000000000000000000000000000000000000..23ca4f3e23ef93e327bd809e29789e2c83450b56 --- /dev/null +++ b/debug/accuracy_tools/monitor/monitor/anomaly_analyse.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import argparse +import ast +import fcntl +import heapq +import json +import os +from pathlib import Path +import sys + +from monitor.utils import print_info_log, print_warn_log +from monitor.anomaly_detect import GradAnomalyData +from monitor.file_check import ( + change_mode, + check_link, + FileCheckConst, + check_path_before_create, + FileChecker, + FileOpen, +) + +ANOMALY_JSON = "anomaly.json" +ANALYSE_JSON = "anomaly_analyse.json" + +class AnomalyDataWriter: + """ + 异常数据写入类,负责将异常数据写入到JSON文件中。 + """ + + def __init__(self, dump_path, rank) -> None: + self.dump_path = dump_path + self.dump_rank_dir = os.path.join(self.dump_path, f"rank{rank}") + self.json_path = os.path.join(self.dump_rank_dir, ANOMALY_JSON) + + @staticmethod + def get_anomaly_dict(anomalies): + """将GradAnomalyData列表转换为json""" + anomalies_json = {} + for anomaly in anomalies: + anomalies_json.update({anomaly.get_key(): anomaly.to_dict()}) + return anomalies_json + + @staticmethod + def update_data_in_single_json(json_path, anomalies_data): + with FileOpen(json_path, "w+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + json.dump(anomalies_data, f, indent=1) + fcntl.flock(f, fcntl.LOCK_UN) + change_mode(json_path, FileCheckConst.DATA_FILE_AUTHORITY) + + def init_detected_json(self): + """初始化落盘文件""" + check_path_before_create(self.dump_path) + if not os.path.exists(self.dump_path): + Path(self.dump_path).mkdir( + mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True + ) + file_check = FileChecker(self.dump_path, FileCheckConst.DIR) + file_check.common_check() + + if not os.path.exists(self.dump_rank_dir): + Path(self.dump_rank_dir).mkdir( + FileCheckConst.DATA_DIR_AUTHORITY, parents=True, exist_ok=True + ) + + if os.path.exists(self.json_path): + file_check = FileChecker( + self.json_path, FileCheckConst.FILE, FileCheckConst.WRITE_ABLE + ) + file_check.common_check() + print_warn_log(f"The existing file will be deleted: {self.json_path}.") + os.remove(self.json_path) + Path(self.json_path).touch() + change_mode(self.json_path, FileCheckConst.DATA_FILE_AUTHORITY) + + def write_detected_json(self, anomalies): + """ + 落盘异常数据 + Args: + anomalies: GradAnomalyData对象列表 + """ + anomalies_json = self.get_anomaly_dict(anomalies) + print_info_log(f"{ANOMALY_JSON} is at {self.dump_rank_dir}.") + if Path(self.json_path).exists() and os.path.getsize(self.json_path) > 0: + with FileOpen(self.json_path, "r+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + data_to_write = json.load(f) + fcntl.flock(f, fcntl.LOCK_UN) + else: + data_to_write = {} + data_to_write.update(anomalies_json) + self.update_data_in_single_json(self.json_path, data_to_write) + + +class AnomalyDataLoader: + def __init__(self, data_path) -> None: + self.data_path = data_path + + @staticmethod + def create_instances_from_dict(anomalies_dict: dict): + instances = [] + for values in anomalies_dict.values(): + try: + instances.append(GradAnomalyData(**values)) + except KeyError as e: + print_warn_log(f"Missing key in anomaly data: {e}") + except ValueError as e: + print_warn_log( + f"Value error when creating a GradAnomalyData instance: {e}" + ) + return instances + + def get_anomalies_from_jsons(self): + """遍历文件夹,从rankK/anomaly.json中读取异常数据 + return: anomalies: GradAnomalyData对象列表 + """ + anomalies = [] + check_link(self.data_path) + for rank_dir in os.listdir(self.data_path): + rank_path = os.path.join(self.data_path, rank_dir) + if not os.path.isdir(rank_path): + continue + json_path = os.path.join(rank_path, ANOMALY_JSON) + if not os.path.exists(json_path): + continue + with 
FileOpen(json_path, "r+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + data_anomalies = json.load(f) + fcntl.flock(f, fcntl.LOCK_UN) + instances = self.create_instances_from_dict(data_anomalies) + anomalies.extend(instances) + return anomalies + + +class AnomalyAnalyse: + def __init__(self) -> None: + self.sorted_anomalies = [] + + def get_range_top_K(self, topk, step_list, anomalies): + """ + 获取前topk个step_list范围内的异常。 + """ + if not step_list: + filtered_anomalies = anomalies + else: + filtered_anomalies = [ + anomaly for anomaly in anomalies if anomaly.step in step_list + ] + if topk >= len(filtered_anomalies): + self.sorted_anomalies = sorted(filtered_anomalies) + else: + self.sorted_anomalies = list(heapq.nsmallest(topk, filtered_anomalies)) + return self.sorted_anomalies + + def rewrite_sorted_anomalies(self, output_path): + """ + 将排序后的异常数据重新落盘 + """ + file_check = FileChecker( + output_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE + ) + file_check.common_check() + + sorted_data = AnomalyDataWriter.get_anomaly_dict(self.sorted_anomalies) + print_info_log(f"{ANALYSE_JSON} is at {output_path}.") + json_path = os.path.join(output_path, ANALYSE_JSON) + if os.path.exists(json_path): + file_check = FileChecker( + json_path, FileCheckConst.FILE, FileCheckConst.WRITE_ABLE + ) + file_check.common_check() + print_warn_log(f"The existing file will be deleted: {json_path}.") + os.remove(json_path) + Path(json_path).touch() + change_mode(json_path, FileCheckConst.DATA_FILE_AUTHORITY) + AnomalyDataWriter.update_data_in_single_json(json_path, sorted_data) + + +def _get_parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--data_path", dest="data_path_dir", default="./", type=str, + help=" The anomaly detect result dictionary: generate from monitor tool.", + required=True, + ) + parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str, + help=" The analyse task result out path.", + required=False, + ) + parser.add_argument("-k", "--topk", dest="top_k_number", default=8, type=int, + help=" Top K number of earliest anomalies.", + required=False, + ) + parser.add_argument("-s", "--step", dest="step_list", default="[]", type=str, + help=" Analyse which steps.", + required=False, + ) + return parser.parse_args(sys.argv[1:]) + +def _get_step_and_stop(args): + try: + step_list = ast.literal_eval(args.step_list) + if not isinstance(step_list, list): + raise ValueError(f"{args.step_list} is not a list") + except (ValueError, SyntaxError, RecursionError) as e: + raise Exception( + f"The step list must be a resolvable list type" + ) from e + if args.top_k_number <= 0: + raise Exception("The top k number must be greater than 0.") + return step_list, args.top_k_number + +def _anomaly_analyse(): + args = _get_parse_args() + step_list, top_k_number = _get_step_and_stop(args) + loader = AnomalyDataLoader(args.data_path_dir) + anomalies = loader.get_anomalies_from_jsons() + analyser = AnomalyAnalyse() + top_anomalies = analyser.get_range_top_K( + top_k_number, step_list, anomalies + ) + analyser.rewrite_sorted_anomalies( + args.out_path if args.out_path else args.data_path_dir + ) + + print_info_log(f"Top {top_k_number} anomalies are listed as follows:") + for index, anomaly in enumerate(top_anomalies): + print_info_log(f"{index}: {anomaly.message}") + + +if __name__ == "__main__": + _anomaly_analyse() + print_info_log("Analyse task completed.") diff --git a/debug/accuracy_tools/monitor/monitor/anomaly_detect.py b/debug/accuracy_tools/monitor/monitor/anomaly_detect.py 
new file mode 100644
index 0000000000000000000000000000000000000000..cb8d6ee82cdc0bf8668b0531ea6ce06e071e0da0
--- /dev/null
+++ b/debug/accuracy_tools/monitor/monitor/anomaly_detect.py
@@ -0,0 +1,247 @@
+import os
+import sys
+import statistics as st
+from abc import ABC
+from typing import List
+from collections import defaultdict
+from dataclasses import dataclass, field
+import pandas as pd
+from torch.utils.tensorboard import SummaryWriter
+from monitor.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory
+from monitor.const import Const
+from monitor.file_check import change_mode, FileCheckConst
+
+
+class ScanRule(ABC):
+    def apply(self, history, cur):
+        raise NotImplementedError("abstract method apply is not implemented")
+
+class AnomalyTurbulence(ScanRule):
+    name = "AnomalyTurbulence"
+    def __init__(self, threshold) -> None:
+        self.threshold = threshold
+    def apply(self, history, cur):
+        baseline = st.mean(history) if isinstance(history, list) else history
+
+        up_bound = baseline + baseline * self.threshold
+        if baseline > 0:
+            return cur > up_bound
+        else:
+            return cur < up_bound
+
+class AnomalyScanner:
+
+    @staticmethod
+    def load_rules(specs: List[dict]):
+        if specs is None:
+            return []
+        alert_rules = []
+        for spec in specs:
+            rule_cls_name = spec["rule_name"]
+            rule_args = spec["args"]
+            cur_module = sys.modules[__name__]
+            rule_cls = getattr(cur_module, rule_cls_name)
+            rule_instance = rule_cls(**rule_args)
+            alert_rules.append(rule_instance)
+        return alert_rules
+
+    @staticmethod
+    def scan(scan_rules: List[ScanRule], history, cur):
+        anomaly = False
+        for rule in scan_rules:
+            anomaly = rule.apply(history, cur)
+            if anomaly:
+                return anomaly, rule.name
+        return anomaly, None
+
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+class AnomalyDataFactory(ABC):
+    def __init__(self, rank, pp_stage, group_mates):
+        super().__init__()
+        self.rank = rank
+        self.pp_stage = pp_stage
+        self.group_mates = group_mates
+        self.micro_step = 0
+        self.vpp_stage = 0
+        self.name2callid = {}
+
+    def set_call_id(self, name2callid):
+        """Update call_id, vpp_stage and related info from the current GradContext."""
+        self.name2callid = name2callid
+
+    def create(self, tag_name, message, step):
+        """Create a GradAnomalyData instance once an anomaly is detected."""
+        param_name = tag_name.split('/')[0]
+        call_id = self.name2callid.get(param_name, -1)
+        if Const.VPP_SEP in param_name:
+            vpp_stage = int(param_name.split(Const.VPP_SEP)[0])
+        else:
+            vpp_stage = 0
+
+        return GradAnomalyData(
+            self.rank,
+            step,
+            self.micro_step,
+            self.pp_stage,
+            vpp_stage,  # stage parsed from the tag name
+            call_id,
+            tag_name,
+            message,
+            self.group_mates
+        )
+
+@dataclass(eq=True)
+class GradAnomalyData:
+    rank: int = 0
+    step: int = 0
+    micro_step: int = 0
+    pp_stage: int = 0
+    vpp_stage: int = 0
+    call_id: int = 0
+    tag_name: str = field(default=None, compare=False)
+    message: str = field(default="", compare=False)
+    group_mates: list = field(default=None, compare=False)
+
+    def __lt__(self, other):
+        if not isinstance(other, GradAnomalyData):
+            return NotImplemented
+        if self.step != other.step:
+            return self.step < other.step
+        if self.micro_step != other.micro_step:
+            return self.micro_step < other.micro_step
+        if self.pp_stage != other.pp_stage:
+            return self.pp_stage > other.pp_stage
+        if self.vpp_stage != other.vpp_stage:
+            return self.vpp_stage > other.vpp_stage
+        if self.call_id != other.call_id:
+            return self.call_id < other.call_id
+        return False
+
+    def __le__(self, other):
+        if not isinstance(other, GradAnomalyData):
+            return NotImplemented
+        return self == other or self < other
+
+    def to_dict(self):
+        return self.__dict__
+
+    def get_key(self):
+        return ''.join(
+            (str(self.tag_name), "_step_", str(self.step), "_call_", str(self.call_id)))
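+
+# Ordering sketch (illustration): sorted() and heapq.nsmallest rank anomalies by
+# earliest step, then earliest micro_step, then LATER pp_stage/vpp_stage first
+# (note the reversed comparisons above), then earliest call_id:
+#     earliest_k = heapq.nsmallest(8, anomalies)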
other.call_id: + return self.call_id < other.call_id + return False + + def __le__(self, other): + if not isinstance(other, GradAnomalyData): + return NotImplemented + return self == other or self < other + + def to_dict(self): + return self.__dict__ + + def get_key(self): + return ''.join( + (str(self.tag_name), "_step_", str(self.step), "_call_", str(self.call_id))) + +class BaseWriterWithAD: + def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): + self.tag2scalars = {} + self.ad_rules = ad_rules + self.job_id = job_id + self.anomaly_inform = anomaly_inform + self.anomaly_factory = anomaly_factory + self.anomalies = [] + self.ndigits = ndigits + + def get_anomalies(self): + """Return the list of anomalies detected so far. + """ + return self.anomalies + + def clear_anomalies(self): + self.anomalies.clear() + + def add_scalar(self, tag, scalar_value, global_step=None): + avg = self._update_tag2scalars(tag, scalar_value) + detected, rule_name = self._ad(scalar_value, history=avg) + if detected: + exception_message = f"Rule {rule_name} reports anomaly signal in {tag} at step {global_step}." + print_info_log(f"{bcolors.WARNING}> {exception_message}{bcolors.ENDC}") + if self.anomaly_inform: + self.anomaly_inform.run(exception_message, self.job_id) + + if self.anomaly_factory: + self.anomalies.append(self.anomaly_factory.create(tag, exception_message, global_step)) + + def _ad(self, scalar_value, history): + return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) + + def _update_tag2scalars(self, tag, scalar_value): + """Update the average and count of a scalar value associated with a tag. + + This method is used to maintain a running average of scalar values for each tag. + + Args: + tag (str): The tag identifier. + scalar_value (float): The scalar value to be added. + + Returns: + float: The average value before update. 
+ """ + if tag not in self.tag2scalars: + self.tag2scalars[tag] = {'avg': scalar_value, 'count': 0} + avg = self.tag2scalars[tag]['avg'] + new_avg = (avg * self.tag2scalars[tag]['count'] + scalar_value) / (self.tag2scalars[tag]['count'] + 1) + self.tag2scalars[tag]['avg'] = new_avg + self.tag2scalars[tag]['count'] += 1 + return avg + + +class CSVWriterWithAD(BaseWriterWithAD): + def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): + super().__init__(path, ad_rules, job_id, anomaly_inform, anomaly_factory, ndigits) + + self.log_dir = path + create_directory(path) + self.context_dict = defaultdict(list) + self.header = [] + + def write_csv(self, prefix, step): + if len(self.context_dict) == 0: + return + filepath = os.path.join(self.log_dir, f'{prefix}_{step}.csv') + if not os.path.exists(filepath): + make_file_safety(filepath) + data_frame = pd.DataFrame(columns=self.header) + data_frame.to_csv(filepath, index=False) + change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY) + + check_file_valid_writable(filepath) + new_data = [] + for name, metric_value in self.context_dict.items(): + if Const.VPP_SEP not in name: + new_data.append([name] + metric_value) + else: + new_data.append(name.split(Const.VPP_SEP) + metric_value) + new_data = pd.DataFrame(new_data) + new_data.to_csv(filepath, mode='a+', header=False, index=False) + self.context_dict = defaultdict(list) + + def add_scalar(self, tag, scalar_value, global_step): + super().add_scalar(tag, scalar_value, global_step) + + name = tag.split('/')[0] + self.context_dict[name].append(round(scalar_value, self.ndigits)) + + def close(self): + pass + +class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): + def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): + super(SummaryWriter, self).__init__(path, ad_rules, job_id, anomaly_inform, anomaly_factory, ndigits) + super().__init__(path) + change_mode(path, FileCheckConst.DATA_DIR_AUTHORITY) + + def add_scalar(self, tag, scalar_value, global_step): + super(SummaryWriter, self).add_scalar(tag, scalar_value, global_step) + return super().add_scalar(tag, scalar_value, global_step) + \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_inform.py b/debug/accuracy_tools/monitor/monitor/anomaly_inform.py similarity index 95% rename from debug/accuracy_tools/kj600/kj600/anomaly_inform.py rename to debug/accuracy_tools/monitor/monitor/anomaly_inform.py index 301ac769217943a36e5d4cbe06033c828e5c675e..fe0fdb3f105cf3f400eee91734e37048fbaa3ca0 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_inform.py +++ b/debug/accuracy_tools/monitor/monitor/anomaly_inform.py @@ -1,18 +1,17 @@ import smtplib from email.mime.text import MIMEText -import sqlite3 from datetime import datetime, timedelta -from kj600.database import Database, ExceptionMessage +from monitor.database import Database, ExceptionMessage # define class InformRegistry to get inform_sub_class class AnomalyInformFactory: @staticmethod def create_informer(**kwargs): - if kwargs['recipient'] == "database": + if kwargs.get('recipient') == "database": return DatabaseInform(**kwargs) - elif kwargs['recipient'] == "email": + elif kwargs.get('recipient') == "email": return EmailInform(**kwargs) else: raise ValueError("Invaild recipient specified") diff --git a/debug/accuracy_tools/kj600/kj600/config.json b/debug/accuracy_tools/monitor/monitor/config.json similarity index 100% rename from debug/accuracy_tools/kj600/kj600/config.json 
rename to debug/accuracy_tools/monitor/monitor/config.json diff --git a/debug/accuracy_tools/monitor/monitor/const.py b/debug/accuracy_tools/monitor/monitor/const.py new file mode 100644 index 0000000000000000000000000000000000000000..095356631d43e163c500178197250bcc5bbf3f7a --- /dev/null +++ b/debug/accuracy_tools/monitor/monitor/const.py @@ -0,0 +1,12 @@ + +class Const: + VPP_SEP = ":" + ACTV_IN = "input" + ACTV_OUT = "output" + ACTVGRAD_IN = "input_grad" + ACTVGRAD_OUT = "output_grad" + + OP_LIST = ['min', 'max', 'norm', 'mean', 'id', 'zeros', 'nans'] + + DEEPSPEED_OPT_TY = ("DeepSpeedZeroOptimizer_Stage1_or_2", "DeepSpeedZeroOptimizer_Stage3") + \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/database.py b/debug/accuracy_tools/monitor/monitor/database.py similarity index 100% rename from debug/accuracy_tools/kj600/kj600/database.py rename to debug/accuracy_tools/monitor/monitor/database.py diff --git a/debug/accuracy_tools/kj600/kj600/distributed/distributed_ops.yaml b/debug/accuracy_tools/monitor/monitor/distributed/distributed_ops.yaml similarity index 100% rename from debug/accuracy_tools/kj600/kj600/distributed/distributed_ops.yaml rename to debug/accuracy_tools/monitor/monitor/distributed/distributed_ops.yaml diff --git a/debug/accuracy_tools/kj600/kj600/distributed/stack_blacklist.yaml b/debug/accuracy_tools/monitor/monitor/distributed/stack_blacklist.yaml similarity index 77% rename from debug/accuracy_tools/kj600/kj600/distributed/stack_blacklist.yaml rename to debug/accuracy_tools/monitor/monitor/distributed/stack_blacklist.yaml index 00b0013619fcfa1445a8df18c3c7d16764fb4872..40692d6942c39dfd5bbe52d33df6de4f1238eab2 100644 --- a/debug/accuracy_tools/kj600/kj600/distributed/stack_blacklist.yaml +++ b/debug/accuracy_tools/monitor/monitor/distributed/stack_blacklist.yaml @@ -1,5 +1,5 @@ stack: -- kj600/distributed +- monitor/distributed - site-packages/torch/nn/modules/module.py - multiprocessing - debugpy \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py b/debug/accuracy_tools/monitor/monitor/distributed/wrap_distributed.py similarity index 89% rename from debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py rename to debug/accuracy_tools/monitor/monitor/distributed/wrap_distributed.py index 4e2d5e175ef0b24ab42990325b83cc2068a9c8e2..1b82c5704fc62ece14ef30f67b45b4a41433d875 100644 --- a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py +++ b/debug/accuracy_tools/monitor/monitor/distributed/wrap_distributed.py @@ -1,7 +1,8 @@ import os -import yaml import re import inspect + +import yaml import torch import torch.nn as nn import torch.distributed as dist @@ -77,6 +78,21 @@ class ApiRegistry: else: setattr(api_group, cc_api_name, cc_api_entry_func) + @staticmethod + def redirect_wait(): + global ORIGIN_WAIT + global PENDING_ASYNC_CC_BY_HANDLE + + def wrapped_wait(work): + def wrapped_wait(*args, **kwargs): + ORIGIN_WAIT(*args, **kwargs) + if args[0] in PENDING_ASYNC_CC_BY_HANDLE: + store_func = PENDING_ASYNC_CC_BY_HANDLE.pop(args[0]) + store_func() + return wrapped_wait + + dist.Work.wait = wrapped_wait(dist.Work) + def redirect_api(self): self.set_api_attr(dist, self.distributed_attr_hooked) self.set_api_attr(dist.distributed_c10d, self.distributed_attr_hooked) @@ -92,19 +108,12 @@ class ApiRegistry: for op_name in get_distributed_ops(): self.distributed_attr_hooked[op_name] = DistributedOPTemplate(op_name, pre_hooks, post_hooks) - def redirect_wait(self): - global 
ORIGIN_WAIT - global PENDING_ASYNC_CC_BY_HANDLE - - def wrapped_wait(work): - def wrapped_wait(*args, **kwargs): - ORIGIN_WAIT(*args, **kwargs) - if args[0] in PENDING_ASYNC_CC_BY_HANDLE: - store_func = PENDING_ASYNC_CC_BY_HANDLE.pop(args[0]) - store_func() - return wrapped_wait - - dist.Work.wait = wrapped_wait(dist.Work) +def get_process_group(process_group): + return ( + process_group + if isinstance(process_group, dist.ProcessGroup) + else dist.GroupMember.WORLD + ) def stack_filter(stack): @@ -115,7 +124,7 @@ def stack_filter(stack): def get_callstack(): callstack = [] - for (_, path, line, func, code, _) in inspect.stack(): + for (_, path, line, func, _, _) in inspect.stack(): stack_line = f'{path}[{line}]' if stack_filter(stack_line): callstack.append(stack_line+' '+func) @@ -180,33 +189,12 @@ def create_async_callback_func(context, ops, args, prefix): catch_data(context, ops, args, prefix) return store_data -def get_tensor_dtype(args): - dtypes = [] - for arg in args: - if isinstance(arg, torch.Tensor): - dtypes.append(arg.dtype) - else: - dtypes.append(None) - return dtypes - -def get_group_members(args): - group = None - for arg in args: - if isinstance(arg, dist.ProcessGroup): - group = arg - if group is None: - group = dist.GroupMember.WORLD - return dist.get_process_group_ranks(group) - def create_hooks(context, monitor): def cc_log_hook(module, args, kwargs): - all_args = args + tuple(kwargs.values()) - dtypes = '|'.join([str(i) if i else '' for i in get_tensor_dtype(all_args)]) stack = ';'.join(get_callstack()) - group_members = '|'.join([str(i) for i in get_group_members(all_args)]) - monitor.cc_logged_stack[module.op_name_].add(';'.join([dtypes, group_members, stack])) + monitor.cc_logged_stack[module.op_name_].add(stack) return def cc_pre_hook(module, args, kwargs): @@ -235,8 +223,8 @@ def create_hooks(context, monitor): if (dist.is_initialized() and dist.get_rank() not in monitor.module_rank_list and monitor.module_rank_list != []): return [pre_hooks, hooks] - pre_hooks.append(cc_log_hook) if monitor.cc_log_only: + pre_hooks.append(cc_log_hook) return [pre_hooks, hooks] if monitor.cc_pre_hook: diff --git a/debug/accuracy_tools/kj600/kj600/features.py b/debug/accuracy_tools/monitor/monitor/features.py similarity index 94% rename from debug/accuracy_tools/kj600/kj600/features.py rename to debug/accuracy_tools/monitor/monitor/features.py index 7810188f7d7df66dce4c489f18062f9381b95646..bedc510dab05f074da9e6d95e52567ece7b69bbd 100644 --- a/debug/accuracy_tools/kj600/kj600/features.py +++ b/debug/accuracy_tools/monitor/monitor/features.py @@ -1,6 +1,6 @@ import torch from torch.autograd.functional import jacobian -from kj600.utils import print_info_log +from monitor.utils import print_info_log @torch.no_grad() @@ -11,6 +11,10 @@ def square_sum(x: torch.tensor): def get_min(x: torch.tensor): return torch.min(x) +@torch.no_grad() +def get_mean(x: torch.tensor): + return torch.mean(x) + @torch.no_grad() def get_norm(x: torch.tensor): return torch.norm(x, p=2) @@ -19,7 +23,6 @@ def get_norm(x: torch.tensor): def get_max(x: torch.tensor): return torch.max(x) - @torch.no_grad() def get_zeros(x: torch.tensor, eps: float): return torch.sum(torch.abs(x) < eps) / x.numel() diff --git a/debug/accuracy_tools/monitor/monitor/file_check.py b/debug/accuracy_tools/monitor/monitor/file_check.py new file mode 100644 index 0000000000000000000000000000000000000000..bd7d7727955648c342fc4c257ca0228beedfd939 --- /dev/null +++ b/debug/accuracy_tools/monitor/monitor/file_check.py @@ -0,0 +1,317 @@ 
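A brief usage sketch of the two entry points defined in the new file_check.py whose hunk follows; the paths here are illustrative assumptions:

from monitor.file_check import FileOpen, FileChecker, FileCheckConst

with FileOpen("./config.json", "r") as f:  # rejects soft links, special characters and oversized files
    text = f.read()

checker = FileChecker("./monitor_output", FileCheckConst.DIR, FileCheckConst.WRITE_ABLE)
real_path = checker.common_check()  # returns the resolved real path when every check passes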
+#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os +import re + +from monitor.utils import print_error_log + + +class CodedException(Exception): + def __init__(self, code, error_info=""): + super().__init__() + self.code = code + self.error_info = self.err_strs.get(code) + error_info + + def __str__(self): + return self.error_info + + +class FileCheckException(CodedException): + INVALID_FILE_ERROR = 0 + FILE_PERMISSION_ERROR = 1 + SOFT_LINK_ERROR = 2 + ILLEGAL_PATH_ERROR = 3 + ILLEGAL_PARAM_ERROR = 4 + FILE_TOO_LARGE_ERROR = 5 + + err_strs = { + SOFT_LINK_ERROR: "[monitor] Soft link detected: ", + FILE_PERMISSION_ERROR: "[monitor] File permission error: ", + INVALID_FILE_ERROR: "[monitor] Invalid file: ", + ILLEGAL_PATH_ERROR: "[monitor] Illegal file path: ", + ILLEGAL_PARAM_ERROR: "[monitor] Illegal open mode: ", + FILE_TOO_LARGE_ERROR: "[monitor] File too large: ", + } + + +class FileCheckConst: + """ + Class for file check const + """ + + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + FILE_PATTERN = r"^[a-zA-Z0-9_./-]+$" + JSON_SUFFIX = ".json" + MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + JSON_SUFFIX: MAX_JSON_SIZE, + } + + +class FileChecker: + """ + The class for file checking. + + Attributes: + file_path: The file or directory path to be verified. + path_type: file or directory + ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE, i.e. whether the path must be writable or readable + file_type(str): The expected file type + """ + + def __init__( + self, file_path, path_type, ability=None, file_type=None, is_script=True + ): + self.file_path = file_path + self.path_type = self._check_path_type(path_type) + self.ability = ability + self.file_type = file_type + self.is_script = is_script + + @staticmethod + def _check_path_type(path_type): + if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: + print_error_log( + f"The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}." 
+ ) + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + return path_type + + def common_check(self): + """ + Purpose: basic user-file checks: soft links, path length, existence, read/write permission, file ownership, and special characters in the path. + Note: file-suffix validity is not a universal check; use a separate dedicated interface when it is needed. + """ + check_path_exists(self.file_path) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + check_path_type(self.file_path, self.path_type) + self.check_path_ability() + if self.is_script: + check_path_owner_consistent(self.file_path) + check_path_pattern_vaild(self.file_path) + check_common_file_size(self.file_path) + check_file_suffix(self.file_path, self.file_type) + return self.file_path + + def check_path_ability(self): + if self.ability == FileCheckConst.WRITE_ABLE: + check_path_writability(self.file_path) + if self.ability == FileCheckConst.READ_ABLE: + check_path_readability(self.file_path) + if self.ability == FileCheckConst.READ_WRITE_ABLE: + check_path_readability(self.file_path) + check_path_writability(self.file_path) + + +class FileOpen: + """ + The class for opening a file in a safe way. + + Attributes: + file_path: The file or directory path to be opened. + mode(str): The file open mode + """ + + SUPPORT_READ_MODE = ["r", "rb"] + SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] + SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] + + def __init__(self, file_path, mode, encoding="utf-8"): + self.file_path = file_path + self.mode = mode + self.encoding = encoding + self._handle = None + + def __enter__(self): + self.check_file_path() + binary_mode = "b" + if binary_mode not in self.mode: + self._handle = open(self.file_path, self.mode, encoding=self.encoding) + else: + self._handle = open(self.file_path, self.mode) + return self._handle + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._handle: + self._handle.close() + + def check_file_path(self): + support_mode = ( + self.SUPPORT_READ_MODE + + self.SUPPORT_WRITE_MODE + + self.SUPPORT_READ_WRITE_MODE + ) + if self.mode not in support_mode: + print_error_log(f"File open does not support {self.mode} mode") + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + self.check_ability_and_owner() + check_path_pattern_vaild(self.file_path) + if os.path.exists(self.file_path): + check_common_file_size(self.file_path) + + def check_ability_and_owner(self): + if self.mode in self.SUPPORT_READ_MODE: + check_path_exists(self.file_path) + check_path_readability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): + check_path_readability(self.file_path) + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + print_error_log(f"The file path {path} is a soft link.") + raise FileCheckException(FileCheckException.SOFT_LINK_ERROR) + + +def check_path_length(path): + if path_len_exceeds_limit(path): + print_error_log("The file path length exceeds limit.") + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_exists(path): + if not os.path.exists(path): + print_error_log(f"The file path {path} does not exist.") + raise 
FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + print_error_log(f"The file path {path} is not readable.") + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + print_error_log(f"The file path {path} is not writable.") + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_executable(path): + if not os.access(path, os.X_OK): + print_error_log(f"The file path {path} is not executable.") + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_other_user_writable(path): + st = os.stat(path) + if st.st_mode & 0o002: + print_error_log( + f"The file path {path} may be insecure because other users have write permissions. " + ) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_owner_consistent(path): + file_owner = os.stat(path).st_uid + if file_owner != os.getuid(): + print_error_log( + f"The file path {path} may be insecure because it does not belong to you." + ) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_pattern_vaild(path): + if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): + print_error_log(f"The file path {path} contains special characters.") + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_file_size(file_path, max_size): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + print_error_log(f"The size of file path {file_path} exceeds {max_size} bytes.") + raise FileCheckException(FileCheckException.FILE_TOO_LARGE_ERROR) + + +def check_common_file_size(file_path): + if os.path.isfile(file_path): + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break + + +def check_file_suffix(file_path, file_suffix): + if file_suffix: + if not file_path.endswith(file_suffix): + print_error_log(f"The {file_path} should be a {file_suffix} file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def check_path_type(file_path, file_type): + if file_type == FileCheckConst.FILE: + if not os.path.isfile(file_path): + print_error_log(f"The {file_path} should be a file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + if file_type == FileCheckConst.DIR: + if not os.path.isdir(file_path): + print_error_log(f"The {file_path} should be a directory!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def check_path_before_create(path): + if path_len_exceeds_limit(path): + raise FileCheckException( + FileCheckException.ILLEGAL_PATH_ERROR, "The file path length exceeds limit." + ) + + if not re.match(FileCheckConst.FILE_PATTERN, os.path.realpath(path)): + raise FileCheckException( + FileCheckException.ILLEGAL_PATH_ERROR, + f"The file path {path} contains special characters." + ) + + +def change_mode(path, mode): + if not os.path.exists(path) or os.path.islink(path): + return + try: + os.chmod(path, mode) + except PermissionError as ex: + raise FileCheckException( + FileCheckException.FILE_PERMISSION_ERROR, + f"Failed to change {path} authority. 
{str(ex)}", + ) from ex + + +def path_len_exceeds_limit(file_path): + return ( + len(os.path.realpath(file_path)) > FileCheckConst.DIRECTORY_LENGTH + or len(os.path.basename(file_path)) > FileCheckConst.FILE_NAME_LENGTH + ) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/monitor/monitor/module_hook.py similarity index 37% rename from debug/accuracy_tools/kj600/kj600/module_hook.py rename to debug/accuracy_tools/monitor/monitor/module_hook.py index 3b600b2b7f28638d0df65305d933a765b4aa45d8..3b3e25ea3f22c212dcb36450961e45b46591a0b8 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/monitor/monitor/module_hook.py @@ -1,29 +1,54 @@ +import inspect import os import uuid import json from collections import defaultdict +from functools import partial +from copy import deepcopy from datetime import datetime import torch +torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0' +if not torch_version_above_or_equal_2: + raise ValueError("msmonitor require torch>=2.0") + import torch.distributed as dist +from torch.utils.hooks import BackwardHook from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook -from kj600.module_spec_verifier import get_config, validate_config_spec -from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon -from kj600.features import eff_rank, get_sign_matches -from kj600.visualizer import HeatmapVisualizer -from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD -from kj600.anomaly_inform import AnomalyInformFactory -from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics -from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate -from kj600.utils import print_warn_log, print_info_log, get_param_struct +from monitor.module_spec_verifier import validate_config_spec +from monitor.optimizer_collect import OptimizerMon, print_rank_0, OptimizerMonFactory +from monitor.features import eff_rank, get_sign_matches +from monitor.visualizer import HeatmapVisualizer +from monitor.anomaly_detect import AnomalyScanner, AnomalyDataFactory, SummaryWriterWithAD, CSVWriterWithAD, BaseWriterWithAD +from monitor.anomaly_analyse import AnomalyDataWriter +from monitor.module_metric import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics, squash_param_name, sqrt_norm_metric, reorder_metric +from monitor.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, get_process_group +from monitor.utils import print_warn_log, print_info_log, print_error_log, get_param_struct, validate_config, validate_ops +from monitor.const import Const +from monitor.file_check import FileOpen + + +try: + import torch_npu +except ImportError: + pass + + +def param_is_not_tensor_parallel_duplicate(param, tp_group): + return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel) or ( + torch.distributed.get_rank(group=tp_group) == 0 + ) +def param_is_data_parallel_duplicate(dp_group): + return torch.distributed.get_rank(group=dp_group) != 0 class ModuleHookContext: def __init__(self, module_name) -> None: self.step = 0 self.micro_step = 0 - self.actv = [] + self.actv = defaultdict(dict) self.actvgrad = [] self.module_name = module_name + self.struct = {} self.format_by_arg = {} self.verified = False self.focused_in_col = 0 @@ 
-31,13 +56,15 @@ class ModuleHookContext: self.ignore_in = False # no need to care when no key 'input' or 'input_grad' found def set_format_by_arg(self, key_name:str, target_config:dict): - if key_name in target_config[self.module_name]: - self.format_by_arg[key_name] = target_config[self.module_name][key_name] + cared = target_config.get(self.module_name, self.struct) + if key_name in cared: + if isinstance(cared[key_name], dict): # cared = self.struct + config = cared[key_name].get('config') + self.format_by_arg[key_name] = config + else: # cared = target_config[self.module_name] + self.format_by_arg[key_name] = cared[key_name] elif key_name in ['input', 'input_grad']: self.ignore_in = True - else: - raise KeyError(f"Missing key: {key_name} of {self.module_name} in config.json") - class OptimizerContext: def __init__(self) -> None: @@ -48,7 +75,9 @@ class OptimizerContext: self.param_adam_ratio = defaultdict() self.param_weight_grad = defaultdict() self.param_exp_avg = defaultdict() + self.exp_avg_metric = [] self.param_exp_avg_sq = defaultdict() + self.exp_avg_sq_metric = [] self.metric_list = [] @@ -71,30 +100,55 @@ class CommunicationContext: def aggregate(self): self.data = self._agg(self.data) +class GradContext: + def __init__(self) -> None: + self.pre = [] + self.post = [] + self.acc_metric = [] + self.acc = {} + self.actv = {} + + def reset(self): + self.pre.clear() + self.post.clear() + self.acc_metric.clear() + self.acc.clear() + self.actv.clear() + class TrainerMon: tensor_metrics = TensorMetrics() - # opt_ty: "Megatron_Float16OptimizerWithFloat16Params" or "Megatron_DistributedOptimizer" - def __init__(self, config_file_path, params_have_main_grad=True, opt_ty=None) -> None: + def __init__(self, config_file_path, process_group=None, params_have_main_grad=True, opt_ty=None) -> None: + """ + opt_ty: "Megatron_Float16OptimizerWithFloat16Params" or "Megatron_DistributedOptimizer" + """ self.module_fwd_hook_context_by_module = defaultdict(ModuleHookContext) self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext) self.optimizer_context = defaultdict(OptimizerContext) self.cc_context = defaultdict(CommunicationContext) + self.grad_context = GradContext() + self.process_group = get_process_group(process_group) self.params_have_main_grad = params_have_main_grad - self.config = get_config(config_file_path) + self.opt_ty = opt_ty + with FileOpen(config_file_path, 'r') as f: + self.config = json.load(f) + validate_config(self.config) self.module_rank_list = self.config.get("module_ranks", []) + self.format = self.config.get('format', 'tensorboard') self.eps = self.config.get('eps', 1e-8) self.ops = self.config.get('ops', []) + self.ndigits = self.config.get('ndigits', 6) + self.all_xy = self.config.get('all_xy', False) self.xy_distribution = self.config.get('xy_distribution', False) if not self.xy_distribution: print_rank_0("> module input/output input_grad/output_grad is not monitored. ") - # backward hook cause megatron-lm pipeline parallel schedule assert exception. # TBD: backward hook cause output tensor is view of some base tensor. root cause investigation pending. self.forward_only = self.config.get('forward_only', False) if self.forward_only: print_rank_0("> only module forward is monitored. 
") + self.backward_only = self.config.get('backward_only', False) self.ur_distribution = self.config.get('ur_distribution', False) if not self.ur_distribution: @@ -121,27 +175,74 @@ class TrainerMon: api_register.redirect_api() alert_setting = self.config.get('alert', {"rules":[]}) - self.alert_rules = AnomalyScanner.load_rules(alert_setting["rules"]) - - anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None + self.alert_rules = AnomalyScanner.load_rules(alert_setting["rules"]) - self.optimizer_hooked = False - output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') + output_base_dir = os.getenv('MONITOR_OUTPUT_DIR', './monitor_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] + if dist.is_initialized(): - if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0: - self.summary_writer = SummaryWriterWithAD( - os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) + rank = dist.get_rank() + tensorboard_dir = os.path.join(output_base_dir, f"{cur_time}-rank{rank}-{unique_id}") + pp_stage = dist.get_group_rank(self.process_group, rank) + group_mates = dist.get_process_group_ranks(self.process_group) else: - self.summary_writer = SummaryWriterWithAD(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) + rank = 0 + tensorboard_dir = os.path.join(output_base_dir, f"{cur_time}-{unique_id}") + pp_stage = 0 + group_mates = [0] + self.rank = rank + + # 初始化AnomalyData工厂 + self.anomaly_data_factory = AnomalyDataFactory(rank, pp_stage, group_mates) if alert_setting.get('dump', False) else None + + if self.format == 'tensorboard': + writer = SummaryWriterWithAD + self.write_metrics = write_metrics_tensorboard + elif self.format == 'csv': + writer = CSVWriterWithAD + self.write_metrics = write_metrics_csv + elif self.format == 'api': + writer = BaseWriterWithAD + self.write_metrics = write_metrics_tensorboard + + if (rank in self.module_rank_list) or len(self.module_rank_list) == 0: + + self.summary_writer = writer( + tensorboard_dir, + self.alert_rules, + unique_id, + None, + self.anomaly_data_factory, + self.ndigits + ) + # 初始化anomaly deteted文件目录 + if self.anomaly_data_factory: + self.anomaly_data_writer = AnomalyDataWriter( + os.path.join(output_base_dir, "anomaly_detected"), rank) + self.anomaly_data_writer.init_detected_json() + # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) - self.micro_batch_number = 0 + self.micro_batch_number = 1 + + self.model = None + self.weight_hooked = False + self.optimizer_hooked = False + self.param_registered = False + self.vpp = False + self.dp_group = None + self.tp_group = None - self.param_name_list = [] self.param2name = defaultdict(str) + self.name2index = defaultdict() + self.name2indices = defaultdict() + self.name2param = {} + self.param_name_call_id = {} + self.call_id = 0 + self.grad_accs = [] + self.handles = defaultdict(list) self.mix_precision_optimizer_mon = OptimizerMonFactory.create_optimizer_mon(opt_ty) if opt_ty is None: @@ -149,18 +250,30 @@ class TrainerMon: raise Exception("ur_distribution cannot be enabled with unknown optimizer.") if self.mv_distribution: raise Exception("mv_distribution cannot be enabled with unknown optimizer.") + self.verbose = 
False self.print_struct = self.config.get("print_struct", False) + if self.print_struct: + self.verbose = True self.struct_printed = False self.module_struct = {} + return def __del__(self): if hasattr(self, "summary_writer"): self.summary_writer.close() + + @property + def ops(self): + return self._ops + + @ops.setter + def ops(self, value): + self._ops = validate_ops(value) @staticmethod def set_wrapped_optimizer(_wrapped_optimizer): - MixPrecsionOptimizerMon.set_wrapped_optimizer(_wrapped_optimizer) + OptimizerMon.set_wrapped_optimizer(_wrapped_optimizer) @staticmethod def adhoc_check(target_tensor:torch.tensor, module_name:str, tensor_name:str, rank_list, ops_list): @@ -171,60 +284,164 @@ class TrainerMon: return TrainerMon.tensor_metrics.stat_insert(target_tensor, ops_list, module_name, tensor_name, rank) + @staticmethod + def build_tbtag_tensor_map(module_name, tag, tensor): + metrics = {} + rank = dist.get_rank() if dist.is_initialized() else None + key = get_summary_writer_tag_name(module_name, tag, rank) + if torch.is_tensor(tensor): + metrics[key] = tensor + return metrics + + @staticmethod + def generate_cc_metrics(cc_name, cc_tensor): + metrics = defaultdict(dict) + rank = dist.get_rank() if dist.is_initialized() else None + for op, tag2tensor in cc_tensor.data.items(): + for tag, tensor in tag2tensor.items(): + key = get_summary_writer_tag_name(cc_name, tag, rank) + metrics[op].update({key: tensor}) + cc_tensor.reset() + return metrics + def hook_modules(self, model:torch.nn.Module, grad_acc_steps): - # fwd=0, bkd=1 - # targets is module name list like ["xx.xxx1", "xxx.xxx2"] which can be obtained when first run. - print_rank_0("> module names:") - for name, _ in model.named_modules(): - print_rank_0(f"\t{name}") + if self.module_rank_list and (self.rank not in self.module_rank_list): + return + + if not isinstance(model, list): + model = [model] + self.model = model + self._register_param_name(model) + self.micro_batch_number = grad_acc_steps - if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): - targets = [x for x, _ in model.named_modules()] if self.print_struct else self.config['targets'].keys() - hooked_count = self._hook_module(targets, model, fwd_or_bkd=0) - print_rank_0(f"> {hooked_count} out of {len(self.config['targets'])} are monitored.") - else: - return + targets = self.config['targets'] + module_in_all_stage = [key for key in targets.keys() if Const.VPP_SEP not in key] + for key in module_in_all_stage: + struct = targets.pop(key) + targets.update({f'{vpp_stage}{Const.VPP_SEP}{key}':struct for vpp_stage in range(len(model))}) + + hooked_count = 0 + for vpp_stage, model_chunk in enumerate(model): + vpp_stage = f'{vpp_stage}{Const.VPP_SEP}' + targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config['targets'].keys() + hooked_count += self._hook_module(targets, model_chunk, vpp_stage) + + print_rank_0(f"> {hooked_count} out of {len(self.config['targets'])} are monitored.") + + def clone_if_tensor(args): + if isinstance(args, tuple): + return tuple([clone_if_tensor(arg) for arg in args]) + elif isinstance(args, torch.Tensor): + return args.clone() + else: + return args + + @torch.no_grad + def wrap_hook_setup(setup): + def wrapped_setup(*args, **kwargs): + args = setup(*args, **kwargs) + args = clone_if_tensor(args) + return args + + return wrapped_setup + + BackwardHook.setup_output_hook = wrap_hook_setup(BackwardHook.setup_output_hook) if not self.optimizer_hooked: - 
self.optimizer_hooked = True - print_rank_0("> parameter names:") - for name, param in model.named_parameters(): - print_rank_0(f"\t{name}") - for target_module, _ in self.config['targets'].items(): - if name.startswith(target_module): # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 - self.param_name_list.append(name) - self.param2name[param] = name self.hook_optimizer() return - def build_tbtag_tensor_map(self, module_name, tag, tensor): - metrics = {} - rank = dist.get_rank() if dist.is_initialized() else None - key = get_summary_writer_tag_name(module_name, tag, rank) - if tensor is not None: - metrics[key] = tensor - return metrics + def generate_mv_metrics(self, opt_context): + if not self.mv_distribution: + return + opt_context.exp_avg_metric = {} + opt_context.exp_avg_sq_metric = {} + m_tag_tensor_map = self.generate_param_metrics('exp_avg', opt_context.param_exp_avg) + v_tag_tensor_map = self.generate_param_metrics('exp_avg_sq', opt_context.param_exp_avg_sq) + for metric_name in self.ops: + opt_context.exp_avg_metric[metric_name] = get_metrics(metric_name, m_tag_tensor_map, self.eps) + opt_context.exp_avg_sq_metric[metric_name] = get_metrics(metric_name, v_tag_tensor_map, self.eps) + + def generate_wgrad_metrics(self): + if not self.wg_distribution: + return {}, {} + + unreduced = {} + if self.weight_hooked: + for metric_name in self.ops: + unreduced[metric_name] = get_metrics(metric_name, self.grad_context.acc, self.eps) + self.grad_context.acc_metric = [unreduced.copy()] + sqrt_norm_metric(unreduced) + unreduced = reorder_metric(unreduced) + + grad_dict = {} + for param, name in self.param2name.items(): + if self.tp_group and not param_is_not_tensor_parallel_duplicate(param, self.tp_group): + continue + if self.dp_group and param_is_data_parallel_duplicate(self.dp_group): + continue + grad = param.main_grad if self.params_have_main_grad else param.grad + if grad is None: + print_warn_log(f"grad is None: {name}, maybe something wrong happened.") + continue + key = get_summary_writer_tag_name(name, 'post_grad', self.rank) + grad_dict[key] = grad + + reduced = {op: get_metrics(op, grad_dict, self.eps) for op in self.ops} + self.grad_context.post = [reduced.copy()] + sqrt_norm_metric(reduced) + reduced = reorder_metric(reduced) + + return reduced, unreduced + + def monitor_gnorm_with_ad(self, model, grad_acc_steps=1, optimizer=None, tp_group=None, dp_group=None): + print_info_log(f'grad acc steps {grad_acc_steps}') + self.hook_optimizer(optimizer) + self.micro_batch_number = grad_acc_steps + + self.dp_group = dp_group + self.tp_group = tp_group + + self._register_param_name(model) + self._hook_weights() + self.hook_modules(model, grad_acc_steps) def generate_param_metrics(self, tag, param_tensor): metrics = {} rank = dist.get_rank() if dist.is_initialized() else None - for param, name in self.param2name.items(): + for name in self.param2name.values(): key = get_summary_writer_tag_name(name, tag, rank) if name not in param_tensor or param_tensor[name] is None: continue metrics[key] = param_tensor[name] return metrics - - def generate_cc_metrics(self, cc_name, cc_tensor): - metrics = defaultdict(dict) - rank = dist.get_rank() if dist.is_initialized() else None - for op, tag2tensor in cc_tensor.data.items(): - for tag, tensor in tag2tensor.items(): - key = get_summary_writer_tag_name(cc_name, tag, rank) - metrics[op].update({key: tensor}) - cc_tensor.reset() - return metrics + + def generate_xy_metrics(self): + actv = {} + for 
fwd_context in self.module_fwd_hook_context_by_module.values(): + for op in self.ops: + if op not in actv: + actv[op] = {} + actv[op].update(fwd_context.actv[op]) + sqrt_norm_metric(actv) + actv = reorder_metric(actv) + + actv_grad = deepcopy(self.grad_context.actv) + sqrt_norm_metric(actv_grad) + actv_grad = reorder_metric(actv_grad) + + return actv, actv_grad + + def reload_xy(self, xy_distribution=False): + self.xy_distribution = xy_distribution + + for handle in self.handles['xy']: + handle.remove() + self.handles['xy'].clear() + self.hook_modules(self.model, self.micro_batch_number) + for _, fwd_context in self.module_fwd_hook_context_by_module.items(): + fwd_context.actv.clear() def write_adhoc_check(self, step): TrainerMon.tensor_metrics.flush(self.summary_writer) @@ -233,64 +450,71 @@ class TrainerMon: if not self.xy_distribution: return for _, fwd_context in self.module_fwd_hook_context_by_module.items(): - if not len(fwd_context.actv) == self.micro_batch_number: - print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, fwd_context.actv, step) + if len(fwd_context.actv) == 0: + continue + self.write_metrics(self.ops, self.summary_writer, [fwd_context.actv], step, 'actv') fwd_context.actv.clear() + if self.grad_context.actv: + self.write_metrics(self.ops, self.summary_writer, [self.grad_context.actv], step, 'actv_grad') - for _, bwd_context in self.module_bwd_hook_context_by_module.items(): - if not len(bwd_context.actvgrad) == self.micro_batch_number: - print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, bwd_context.actvgrad, step) - bwd_context.actvgrad.clear() + def write_mv_tb(self, opt_context): + if not self.mv_distribution: + return + self.write_metrics(self.ops, self.summary_writer, [opt_context.exp_avg_metric], opt_context.step, 'exp_avg') + self.write_metrics(self.ops, self.summary_writer, [opt_context.exp_avg_sq_metric], opt_context.step, 'exp_avg_sq') + + def write_grad_tb(self, step): + if not self.wg_distribution: + return - def hook_optimizer(self): + self.write_metrics(self.ops, self.summary_writer, self.grad_context.post, step, 'grad_reduced') + self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_unreduced') + + def hook_optimizer(self, optimizer=None): # in DDP by default use params_have_main_grad def optimizer_pre_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] + if self.opt_ty in Const.DEEPSPEED_OPT_TY: + if context.step == 0: + return + elif context.step == 1: + self.name2indices = self.mix_precision_optimizer_mon.get_param_index(self.param2name, self.name2index) + mv_result = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name, self.name2indices) + self.param2name = mv_result.grad + else: + mv_result = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) + context.param_exp_avg = mv_result.exp_avg + context.param_exp_avg_sq = mv_result.exp_avg_sq + context.param_adam_update = mv_result.update + context.param_adam_ratio = mv_result.ratio + if self.print_struct and not all(value == {} for value in self.module_struct.values()) and not self.struct_printed: self._smallest_rank_print("> module struct:") 
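# Note: when print_struct is enabled, the structure dump below is deliberately followed by an exception (unless cc_log_only is set), so training exits after the first step; the printed structure can then presumably be used to fill in the 'targets' section of config.json.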
self._smallest_rank_print(json.dumps(self.module_struct, indent=4)) - self.struct_printed = True if not self.cc_log_only: raise Exception("exit after first step when print model struct") if self.cc_log_only and context.step > 0: self._smallest_rank_print("> Used communication ops and corresponding stack") self._smallest_rank_print(json.dumps({k:[i.split(';') for i in v] for k,v in self.cc_logged_stack.items()}, indent=4)) raise Exception("exit after first step when print cc stack") - - context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self, - optimizer, self.param2name) - - for param, name in self.param2name.items(): - if "params_effrank" in self.config and name in self.config["params_effrank"]: - context.param_effective_rank[name] = eff_rank(param.detach()) - grad = param.main_grad if self.params_have_main_grad else param.grad - if grad is None: - print_warn_log(f"grad is None: {name}, maybe something wrong happened.") - continue - if self.wg_distribution: - context.param_weight_grad[name] = grad - if self.mg_direction: + self.generate_wgrad_metrics() + self.generate_mv_metrics(context) + + tbtag_tensor_map = {} + if self.mg_direction: + for param, name in self.param2name.items(): + grad = param.main_grad if self.params_have_main_grad else param.grad + if grad is None: + print_warn_log(f"grad is None: {name}, maybe something wrong happened.") + continue if context.step == 0: same_direction_ratio = torch.tensor(1.) else: same_direction_ratio = get_sign_matches(grad, context.param_exp_avg[name]) context.param_mg_direction[name] = same_direction_ratio - - tbtag_tensor_map = {} - if self.wg_distribution: - tbtag_tensor_map.update(self.generate_param_metrics('weight_grad', context.param_weight_grad)) - if self.mv_distribution: - tbtag_tensor_map.update(self.generate_param_metrics('exp_avg', context.param_exp_avg)) - tbtag_tensor_map.update(self.generate_param_metrics('exp_avg_sq', context.param_exp_avg_sq)) - if self.mg_direction: tbtag_tensor_map.update(self.generate_param_metrics('mg_direction', context.param_mg_direction)) - # if not tbtag_tensor_map: - # return + metric_dict = {} for metric_name in self.ops: metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) @@ -299,6 +523,7 @@ class TrainerMon: cc_metrics = self.generate_cc_metrics(k, c) for op, m in cc_metrics.items(): metric_dict[op].update(m) + if not metric_dict: return context.metric_list.append(metric_dict) @@ -306,9 +531,16 @@ class TrainerMon: def optimizer_post_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] + if (self.opt_ty in Const.DEEPSPEED_OPT_TY and context.step == 0): + context.step += 1 + return rank = dist.get_rank() if dist.is_initialized() else None + if self.anomaly_data_factory: + self.anomaly_data_factory.set_call_id(self.param_name_call_id) self.write_xy_tb(context.step) + self.write_grad_tb(context.step) + self.write_mv_tb(context) self.write_adhoc_check(context.step) if self.ur_distribution: @@ -317,20 +549,43 @@ class TrainerMon: for param_name, _ in context.param_adam_ratio.items(): self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer) - for metric_name in self.ops: - if not context.metric_list: - break - write_metrics_tensorboard(metric_name, self.summary_writer, context.metric_list, context.step) + if context.metric_list: + self.write_metrics(self.ops, 
self.summary_writer, context.metric_list, context.step, 'other') context.metric_list.clear() context.step += 1 + self.grad_context.reset() + if self.anomaly_data_factory: + self.anomaly_data_writer.write_detected_json(self.summary_writer.get_anomalies()) + self.summary_writer.clear_anomalies() + self.call_id = 0 + self.param_name_call_id.clear() + return + + def patch_step(func, optimizer): + def wrapper(*args, **kwargs): + optimizer_pre_step_hook(optimizer, args, kwargs) + out = func(*args, **kwargs) + optimizer_post_step_hook(optimizer, args, kwargs) + return out + return wrapper + + if self.optimizer_hooked: return - if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): - register_optimizer_step_pre_hook(optimizer_pre_step_hook) - register_optimizer_step_post_hook(optimizer_post_step_hook) + + if optimizer: + optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer) + + else: + if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): + register_optimizer_step_pre_hook(optimizer_pre_step_hook) + register_optimizer_step_post_hook(optimizer_post_step_hook) + self.optimizer_hooked = True return def _smallest_rank_print(self, msg): + if not self.verbose: + return if dist.is_initialized(): if self.module_rank_list: if dist.get_rank() == min(self.module_rank_list): @@ -340,44 +595,139 @@ print_info_log(msg) else: print_info_log(msg) + + def _is_target_param(self, param_name, param, prefix): + squash_name = prefix + squash_param_name(param_name) + name = prefix + param_name + for target in self.config['targets'].keys(): + if param_name.startswith(target) or squash_name.startswith(target) or name.startswith(target): + setattr(param, "zero_out_wgrad", True) + return True + + return False - def _hook_module(self, target_names, module: torch.nn.Module, fwd_or_bkd): + def _register_chunk(self, model_chunk, prefix): + for index, (param_name, param) in enumerate(model_chunk.named_parameters()): + if not param.requires_grad: + continue + if self._is_target_param(param_name, param, prefix): + name = prefix + squash_param_name(param_name) + if name in self.param2name.values(): + print_error_log(f'same name {name} for different params. Current param is {param_name}. This may be an error in squash_param_name.') + raise Exception("param with same name will be overwritten.") + self.param2name[param] = name + self.name2param[name] = param + self.name2index[name] = index + + def _register_param_name(self, model): + if self.param_registered: + return + + if not isinstance(model, list): + model = [model] + + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') + + for vpp_stage, model_chunk in enumerate(model): + prefix = f'{vpp_stage}{Const.VPP_SEP}' + self._register_chunk(model_chunk, prefix) + + self.param_registered = True + + def _is_target_module(self, module_name, targets, vpp_stage): + if self.all_xy or self.print_struct: + return vpp_stage + squash_param_name(module_name) + for pattern in [ + vpp_stage + squash_param_name(module_name), + vpp_stage + module_name, + ]: + if pattern in targets: + return pattern + return "" + + + def _hook_module(self, target_names, module: torch.nn.Module, vpp_stage=''): if '_modules' not in module.__dict__: # nothing to hook return 0 + def _is_recomputation(): + """Check if the current operation is in the recomputation phase. 
+ + This function inspects the current call stack to indicate whether the current operation is in the + recomputation phase. We use a blacklist mechanism; the megatron and mindspeed frameworks are currently supported. + megatron: The 'backward' function is called by the 'torch/autograd/function.py' file. + mindspeed: The 'checkpoint_function_backward' function is called by the 'torch/autograd/function.py' + file, or a custom module (using CheckpointWithoutOutput) executes its 'backward' within the + 'torch/_tensor.py' file. + + Returns: + bool: True if in the recomputation phase, False otherwise. + """ + backward_function_indices = [] + call_stack = inspect.stack() + + # Identify whether the function 'backward' is being executed within the 'torch/_tensor.py' file. + for frame_info in call_stack: + if frame_info.function == 'backward' and frame_info.filename.endswith('torch/_tensor.py'): + del call_stack + return True + + # Identify indices in the call stack where the specific function is being executed + for idx, frame_info in enumerate(call_stack): + if frame_info.function == 'backward' or frame_info.function == 'checkpoint_function_backward': + backward_function_indices.append(idx) + + # Check if the execution is within 'torch/autograd/function.py' file + for idx in backward_function_indices: + if idx + 1 < len(call_stack) and call_stack[idx + 1].filename.endswith('torch/autograd/function.py'): + del call_stack + return True + + del call_stack + return False + def fwd_hook_fun(module, module_input, module_output): + if _is_recomputation(): + return context: ModuleHookContext = self.module_fwd_hook_context_by_module[module] + if not context.struct: + context.struct = {Const.ACTV_IN: get_param_struct(module_input), Const.ACTV_OUT: get_param_struct(module_output)} if self.print_struct: - self.module_struct[context.module_name].update( - {"input": f"{get_param_struct(module_input)}", "output": f"{get_param_struct(module_output)}"}) + if context.module_name not in self.module_struct: + self.module_struct[context.module_name] = {} + self.module_struct[context.module_name].update(context.struct) return - if not self.xy_distribution: + if not module.training: return if not context.format_by_arg: - context.set_format_by_arg('input', self.config['targets']) - context.set_format_by_arg('output', self.config['targets']) + context.set_format_by_arg(Const.ACTV_IN, self.config['targets']) + context.set_format_by_arg(Const.ACTV_OUT, self.config['targets']) + if not context.format_by_arg: + return if not context.verified: if not context.ignore_in: - context.focused_in_col = validate_config_spec(context.format_by_arg['input'], module_input, context.module_name, 'input') - context.focused_out_col = validate_config_spec(context.format_by_arg['output'], module_output, context.module_name, 'output') + context.focused_in_col = validate_config_spec(context.format_by_arg[Const.ACTV_IN], module_input, context.module_name, Const.ACTV_IN) + context.focused_out_col = validate_config_spec(context.format_by_arg[Const.ACTV_OUT], module_output, context.module_name, Const.ACTV_OUT) context.verified = True # expect output be tensor type tbtag_tensor_map = {} if not context.ignore_in: cared_input = module_input if context.focused_in_col is None else module_input[context.focused_in_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'input', cared_input)) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(f'{context.module_name}_{context.micro_step}', Const.ACTV_IN, cared_input)) cared_output = 
module_output if context.focused_out_col is None else module_output[context.focused_out_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'output', cared_output)) - metric_dict = {} - for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) - if context.micro_step == 0 and context.actv: - print_warn_log( - f"actv context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. Now clear it.") - context.actv.clear() - context.actv.append(metric_dict) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(f'{context.module_name}_{context.micro_step}', Const.ACTV_OUT, cared_output)) + for metric_name in self.ops: + if context.micro_step == 0 and context.actv.get(metric_name, []): + print_warn_log( + f"actv context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. Now clear it.") + context.actv.clear() + context.actv[metric_name].update(get_metrics(metric_name, tbtag_tensor_map, self.eps)) + context.micro_step += 1 if context.micro_step == self.micro_batch_number: context.micro_step = 0 @@ -386,34 +736,39 @@ class TrainerMon: def bwd_hook_fun(module, input_grad, output_grad): context: ModuleHookContext = self.module_bwd_hook_context_by_module[module] + if not context.struct: + context.struct = {Const.ACTVGRAD_IN: get_param_struct(input_grad), Const.ACTVGRAD_OUT: get_param_struct(output_grad)} if self.print_struct: - self.module_struct[context.module_name].update( - {"input_grad": f"{get_param_struct(input_grad)}", "output_grad": f"{get_param_struct(output_grad)}"}) - return - if not self.xy_distribution: + if context.module_name not in self.module_struct: + self.module_struct[context.module_name] = {} + self.module_struct[context.module_name].update(context.struct) return if not context.format_by_arg: - context.set_format_by_arg('input_grad', self.config['targets']) - context.set_format_by_arg('output_grad', self.config['targets']) + context.set_format_by_arg(Const.ACTVGRAD_IN, self.config['targets']) + context.set_format_by_arg(Const.ACTVGRAD_OUT, self.config['targets']) + if not context.format_by_arg: + return if not context.verified: if not context.ignore_in: - context.focused_in_col = validate_config_spec(context.format_by_arg['input_grad'], input_grad, context.module_name, 'input_grad') - context.focused_out_col = validate_config_spec(context.format_by_arg['output_grad'], output_grad, context.module_name, 'output_grad') + context.focused_in_col = validate_config_spec(context.format_by_arg[Const.ACTVGRAD_IN], input_grad, context.module_name, Const.ACTVGRAD_IN) + context.focused_out_col = validate_config_spec(context.format_by_arg[Const.ACTVGRAD_OUT], output_grad, context.module_name, Const.ACTVGRAD_OUT) context.verified = True tbtag_tensor_map = {} if not context.ignore_in: cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'input_grad', cared_input_grad)) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(f'{context.module_name}_{context.micro_step}', Const.ACTVGRAD_IN, cared_input_grad)) cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'output_grad', cared_output_grad)) - metric_dict = {} - for metric_name in self.ops: - metric_dict[metric_name] = 
get_metrics(metric_name, tbtag_tensor_map, self.eps) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(f'{context.module_name}_{context.micro_step}', Const.ACTVGRAD_OUT, cared_output_grad)) + if context.micro_step == 0 and context.actvgrad: print_warn_log(f"actvgrad context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. Now clear it.") context.actvgrad.clear() - context.actvgrad.append(metric_dict) + + for metric_name in self.ops: + if metric_name not in self.grad_context.actv: + self.grad_context.actv[metric_name] = {} + self.grad_context.actv[metric_name].update(get_metrics(metric_name, tbtag_tensor_map, self.eps)) context.micro_step += 1 if context.micro_step == self.micro_batch_number: @@ -421,15 +776,49 @@ class TrainerMon: context.step += 1 return - hooked_count = 0 - for name, submodule in module.named_modules(): - self.module_struct[name] = {} - if name in target_names: - submodule.register_forward_hook(fwd_hook_fun) - self.module_fwd_hook_context_by_module[submodule] = ModuleHookContext(name) + if self.backward_only and self.forward_only: + print_warn_log('not enable backward_only and forward_only simultaneously') + + hooked_count = 0 + if self.xy_distribution or self.print_struct: + for module_name, submodule in module.named_modules(): + name = self._is_target_module(module_name, target_names, vpp_stage) + if not name: + continue + if not self.backward_only: + handle = submodule.register_forward_hook(fwd_hook_fun) + self.handles['xy'].append(handle) + self.module_fwd_hook_context_by_module[submodule] = ModuleHookContext(name) if not self.forward_only: - submodule.register_full_backward_hook(bwd_hook_fun) + handle = submodule.register_full_backward_hook(bwd_hook_fun) + self.handles['xy'].append(handle) self.module_bwd_hook_context_by_module[submodule] = ModuleHookContext(name) print_rank_0(f"> {name} is monitored successfully") hooked_count += 1 return hooked_count + + def _hook_weights(self): + context = self.grad_context + + @torch.no_grad + def param_hook(*args, context_dict, param, key, name): + param.micro_step += 1 + self.param_name_call_id[name] = self.call_id + self.call_id += 1 + if param.micro_step == self.micro_batch_number: + param.micro_step = 0 + if self.params_have_main_grad: + context_dict[key] = param.main_grad.clone() + else: + context_dict[key] = param.grad.clone() + + for param, name in self.param2name.items(): + key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) + setattr(param, 'micro_step', 0) + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + handle = grad_acc.register_hook(partial(param_hook, context_dict=context.acc, param=param, key=key, name=name)) + self.grad_accs.append(grad_acc) + self.handles['wgrads'].append(handle) + + self.weight_hooked = True diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/monitor/monitor/module_metric.py similarity index 58% rename from debug/accuracy_tools/kj600/kj600/module_metric.py rename to debug/accuracy_tools/monitor/monitor/module_metric.py index e09536b072cf7953e6b6106420936416d4264d0e..3c99afed187ea99dcaca25caed710091f1230d58 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/monitor/monitor/module_metric.py @@ -1,15 +1,30 @@ import math +import re import statistics +import itertools +import torch -from kj600.features import square_sum, get_max, get_min, get_zeros, get_nans, get_norm +from monitor.const import Const +from monitor.features 
import square_sum, get_max, get_min, get_zeros, get_nans, get_norm, get_mean +from monitor.utils import print_warn_log def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): if rank is None: return f"{module_or_param_name}/{tag}" else: - return f"{module_or_param_name}/{rank}/{tag}" - + return f"{module_or_param_name}/rank{rank}/{tag}" + +def squash_param_name(param_name): + name = '' + for pattern in ['layers?\.(.*)', 'embeddings?\.(.*)', 'final.*', 'output.*','norm.*']: + match = re.findall(pattern, param_name) + if match: + name += match[0] + break + if name == '': + name = param_name + return name # Registry for all metric implementation classes config_metric_registry = {} @@ -28,7 +43,7 @@ class TensorMetrics: self.metrics = {} #tensor_tag --> [] self.cur_idx = {} - fun_map = {"norm": get_norm, "max": get_max, "min": get_min} + fun_map = {"norm": get_norm, "max": get_max, "min": get_min, "mean": get_mean} #get stats and insert into metrics dictionary def stat_insert(self, tensor, stat_ops, module_name, tensor_name, rank, eps=1e-8): prefix = get_summary_writer_tag_name(module_name, tensor_name, rank) @@ -59,7 +74,12 @@ class Metric(object): def get_metrics(self, tag2tensor: dict, eps): metrics_dict = {} for tag, tensor in tag2tensor.items(): - metrics_dict[tag] = self.get_metric_value(tensor, eps) + try: + metrics_dict[tag] = self.get_metric_value(tensor, eps) + if torch.isnan(metrics_dict[tag]): + print_warn_log(f'nan when calculating metric for {tag}') + except RuntimeError as e: + metrics_dict[tag] = torch.tensor(torch.nan) return metrics_dict @register_config_metric("min") @@ -75,6 +95,19 @@ class MinMetric(Metric): summary_writer.add_scalar(f'{key}_min', min_value, step) +@register_config_metric("mean") +class MeanMetric(Metric): + @staticmethod + def get_metric_value(tensor, eps): + return get_mean(tensor) + + @staticmethod + def metric_tensorboard(metric_name, summary_writer, metric_value, step): + for key in metric_value[0][metric_name].keys(): + mean_value = sum([item[metric_name][key].item() for item in metric_value]) / len(metric_value) + summary_writer.add_scalar(f'{key}_mean', mean_value, step) + + @register_config_metric("max") class MaxMetric(Metric): @staticmethod @@ -100,7 +133,6 @@ class NormMetric(Metric): norm_value = math.sqrt(sum([item[metric_name][key].item() for item in metric_value])) summary_writer.add_scalar(f'{key}_norm', norm_value, step) - @register_config_metric("zeros") class ZerosMetric(Metric): @staticmethod @@ -134,13 +166,25 @@ class IdentMetric(Metric): return tensor @staticmethod - def metric_tensorboard(metric_name, summary_writer, metric_value, step): #metric_value is a dict, key is parameter name and value is a list of scalar tensor + def metric_tensorboard(metric_name, summary_writer, metric_value, context): #metric_value is a dict; key is parameter name and value is a list of scalar tensors if len(metric_value) == 1: for key, value in metric_value[0][metric_name].items(): if not value: continue - summary_writer.add_scalar(f'{key}_identical', value.item(), step) - + summary_writer.add_scalar(f'{key}_identical', value.item(), context) + +def reorder_metric(metrics): + new_metrics = {} + for op, tag2metric in metrics.items(): + for tag, metric in tag2metric.items(): + if tag not in new_metrics: + new_metrics[tag] = {} + new_metrics[tag][op] = metric + return new_metrics + +def sqrt_norm_metric(metrics): + if 'norm' in metrics: + metrics["norm"] = {tag:metric**0.5 for tag, metric in metrics["norm"].items()} def get_metrics(metric_name, tag2tensor, eps): try:
@@ -148,11 +192,38 @@ def get_metrics(metric_name, tag2tensor, eps): return fun_metric().get_metrics(tag2tensor, eps) except KeyError as e: raise ValueError(f"Unsupported metric: {metric_name}. Expected one of: {config_metric_registry.keys()}") from e - - -def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step): - try: - fun_metric = config_metric_registry[metric_name] - return fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) - except KeyError as e: - raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e + +def write_metrics_tensorboard(ops, summary_writer, metric_value, step, prefix=''): + for metric_name in ops: + try: + fun_metric = config_metric_registry[metric_name] + fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) + except KeyError as e: + raise ValueError(f"Unsupported metric: {metric_name}. Expected one of: {config_metric_registry.keys()}") from e + +def write_metrics_csv(ops, summary_writer, metric_value, step, prefix=''): + for metric_name in ops: + try: + fun_metric = config_metric_registry[metric_name] + fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) + + except KeyError as e: + raise ValueError(f"Unsupported metric: {metric_name}. Expected one of: {config_metric_registry.keys()}") from e + + if not summary_writer.header: + if prefix == 'actv': + summary_writer.header = ['module_name'] + else: + summary_writer.header = ['param_name'] + + if prefix in ['actv', 'actv_grad']: + summary_writer.header.extend([''.join(i) for i in itertools.product(ops, ['_input', '_output'])]) + else: + summary_writer.header.extend(ops) + + for key in metric_value[0][ops[0]].keys(): + if Const.VPP_SEP in key: + summary_writer.header.insert(0, 'vpp_stage') + break + summary_writer.write_csv(prefix, step) + summary_writer.header = [] diff --git a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py b/debug/accuracy_tools/monitor/monitor/module_spec_verifier.py similarity index 74% rename from debug/accuracy_tools/kj600/kj600/module_spec_verifier.py rename to debug/accuracy_tools/monitor/monitor/module_spec_verifier.py index 395aa82f17a87cdf742a8294e29ccb1c32081200..f2abc688347a66725aeb4e457194b33941c8e7db 100644 --- a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py +++ b/debug/accuracy_tools/monitor/monitor/module_spec_verifier.py @@ -2,14 +2,8 @@ import json import re import abc import torch -from kj600.utils import check_file_valid_readable - -def get_config(file_path='config.json'): - check_file_valid_readable(file_path) - with open(file_path, 'r') as file: - config = json.load(file) - return config +from monitor.utils import print_warn_log # Registry for all validator implementation classes config_validator_registry = {} @@ -35,22 +29,25 @@ class ConfigValidator(metaclass=abc.ABCMeta): class TensorValidator(ConfigValidator): def check_pattern_match(self, config_spec:str): pattern = re.compile(r"tensor") - return pattern.match(config_spec) + return pattern.match(config_spec) def validate(self, actual_data, module_name:str, data_type:str, pattern_match): if not torch.is_tensor(actual_data): raise ValueError(f"Format of {module_name} {data_type} does not match the required format 'tensor' in config.") - return None @register_config_validator class TupleValidator(ConfigValidator): def check_pattern_match(self, config_spec:str): - pattern =
re.compile(r"tuple\[(\d+)\]:(\d+)") - return pattern.match(config_spec) + pattern = re.compile(r"tuple\[(\d+)\]:?(\d+)?") + return pattern.match(config_spec) def validate(self, actual_data, module_name: str, data_type: str, pattern_match): - length, index = map(int, pattern_match.groups()) + length, index = pattern_match.groups() + if index is None: + index = 0 + length = int(length) + index = int(index) if not (0 <= index < length): raise ValueError(f"Format of {module_name} {data_type} in config.json does not match the required format 'tuple[x]:y'. y must be greater than or equal to 0 and less than x.") if not isinstance(actual_data, tuple): @@ -61,10 +58,15 @@ class TupleValidator(ConfigValidator): def validate_config_spec(config_spec:str, actual_data, module_name:str, data_type:str): + focused_col = None for _, validator_cls in config_validator_registry.items(): config_validator = validator_cls() pattern_match = config_validator.check_pattern_match(config_spec) if pattern_match: - focused_col = config_validator.validate(actual_data, module_name, data_type, pattern_match) + try: + focused_col = config_validator.validate(actual_data, module_name, data_type, pattern_match) + except ValueError as e: + print_warn_log(str(e)) return focused_col - raise ValueError(f"config spec in {module_name} {data_type} not supported, expected spec:'tuple\[(\d+)\]:(\d+)' or 'tensor', actual spec: {config_spec}.") + print_warn_log(f"config spec in {module_name} {data_type} not supported, expected spec:'tuple\[(\d+)\]:(\d+)' or 'tensor', actual spec: {config_spec}.") + return focused_col diff --git a/debug/accuracy_tools/monitor/monitor/optimizer_collect.py b/debug/accuracy_tools/monitor/monitor/optimizer_collect.py new file mode 100644 index 0000000000000000000000000000000000000000..fde15434ef9f4b99a4c1b515e50ad6dd67561d79 --- /dev/null +++ b/debug/accuracy_tools/monitor/monitor/optimizer_collect.py @@ -0,0 +1,284 @@ +from abc import ABC, abstractmethod +from collections import defaultdict, namedtuple +import torch +import torch.distributed as dist + +from monitor.utils import print_warn_log, print_error_log + + +def print_rank_0(message): + if dist.is_initialized(): + if dist.get_rank() == 0: + print(message) + else: + print(message) + + +MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio")) +MV_Grad_Result = namedtuple('MV_Grad_Result', ("exp_avg", "exp_avg_sq", "update", "ratio", "grad")) + + +class OptimizerMon(ABC): + wrapped_optimizer = None + + def __init__(self) -> None: + self.fp16_to_fp32_param = {} + self.is_stage3 = False + + @classmethod + def set_wrapped_optimizer(cls, wrapped_optimizer): + cls.wrapped_optimizer = wrapped_optimizer + + @abstractmethod + def fetch_mv(self, monitor, torch_opt, params2name): + pass + + def _fetch_mv_in_adam(self, monitor, torch_opt, params2name): + exp_avg_dict = defaultdict(float) + exp_avg_sq_dict = defaultdict(float) + update_dict = defaultdict() + ratio_dict = defaultdict() + + for param, name in params2name.items(): + if param in self.fp16_to_fp32_param: + param = self.fp16_to_fp32_param[param] + + if param in torch_opt.state: + state_param = torch_opt.state.get(param, None) + exp_avg = state_param.get("exp_avg", None) + exp_avg_sq = state_param.get("exp_avg_sq", None) + if exp_avg is None or exp_avg_sq is None: + print_warn_log(f"exp_avg or exp_avg_sq of {name} is None, maybe something wrong happened.") + continue + if monitor.mv_distribution: + exp_avg_dict[name] = exp_avg + exp_avg_sq_dict[name] = exp_avg_sq + if 
monitor.mg_direction: + exp_avg_dict[name] = exp_avg + if monitor.ur_distribution: + if 'step' in state_param: + step = state_param['step'] # Optimizer from pytorch or FusedAdam from apex(used by megatron) + elif 'step' in torch_opt.param_groups[0]: + step = torch_opt.param_groups[0]['step'] # AdamW from mindspeed + else: + print_warn_log(f"step of {name} is None, maybe something wrong happened.") + continue + exp_avg_hat = exp_avg / (1 - torch_opt.defaults['betas'][0] ** step) + exp_avg_sq_hat = exp_avg_sq / (1 - torch_opt.defaults['betas'][1] ** step) + update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + torch_opt.defaults['eps']) + ratio_dict[name] = exp_avg_hat / torch.sqrt(exp_avg_sq_hat) + monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) + monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) + return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict) + + def _fetch_mv_grad_in_adam(self, monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat): + exp_avg_dict = defaultdict(float) + exp_avg_sq_dict = defaultdict(float) + update_dict = defaultdict() + ratio_dict = defaultdict() + param2name = defaultdict() + fp32_partitioned_groups_flat_grad = defaultdict() + mix_prec_opt = OptimizerMon.wrapped_optimizer + partition_id = dist.get_rank() + + def get_flatten_grad(self, optimizer, group_idx): + if self.is_stage3 or optimizer.cpu_offload: + return fp32_partitioned_groups_flat[group_idx].grad + elif fp32_partitioned_groups_flat[group_idx].grad is None: + if partition_id == dist.get_world_size() - 1: + fp32_partitioned_groups_flat_grad = optimizer.flatten_dense_tensors_aligned( + optimizer.averaged_gradients[group_idx], + int(optimizer.partition_size[group_idx]) + ).to(fp32_partitioned_groups_flat[group_idx].dtype) + else: + fp32_partitioned_groups_flat_grad = optimizer.flatten( + optimizer.averaged_gradients[group_idx] + ).to(fp32_partitioned_groups_flat[group_idx].dtype) + return fp32_partitioned_groups_flat_grad + else: + return fp32_partitioned_groups_flat[group_idx].grad + + for group_idx in range(len(fp32_partitioned_groups_flat)): + fp32_partitioned_groups_flat_grad[group_idx] = get_flatten_grad(self, mix_prec_opt, group_idx) + + for name in params2name.values(): + start_idx, end_idx, group_idx, group_with_rank = name2indices[name] + if group_with_rank != partition_id and isinstance(group_with_rank, int): + continue + fp32_param = fp32_partitioned_groups_flat[group_idx][start_idx: end_idx] + fp32_param.grad = fp32_partitioned_groups_flat_grad[group_idx][start_idx: end_idx] + param2name[fp32_param] = name + state_param = list(mix_prec_opt.state.values())[0] + exp_avg = state_param.get("exp_avg", None) + exp_avg_sq = state_param.get("exp_avg_sq", None) + if exp_avg is None or exp_avg_sq is None: + print_warn_log(f"exp_avg or exp_avg_sq of {name} is None, maybe something wrong happened.") + continue + exp_avg = exp_avg[start_idx: end_idx] + exp_avg_sq = exp_avg_sq[start_idx: end_idx] + if monitor.mv_distribution: + exp_avg_dict[name] = exp_avg + exp_avg_sq_dict[name] = exp_avg_sq + if monitor.mg_direction: + exp_avg_dict[name] = exp_avg + if monitor.ur_distribution: + if 'step' in state_param: + step = state_param['step'] # Optimizer from pytorch or FusedAdam from apex(used by megatron) + elif 'step' in torch_opt.param_groups[0]: + step = torch_opt.param_groups[0]['step'] # AdamW from mindspeed + else: + print_warn_log(f"step of {name} is None, maybe something wrong happened.") + 
continue + exp_avg_hat = exp_avg / (1 - torch_opt.defaults['betas'][0] ** step) + exp_avg_sq_hat = exp_avg_sq / (1 - torch_opt.defaults['betas'][1] ** step) + update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + torch_opt.defaults['eps']) + ratio_dict[name] = exp_avg_hat / torch.sqrt(exp_avg_sq_hat) + monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) + monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) + del fp32_partitioned_groups_flat_grad + return MV_Grad_Result(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict, + grad=param2name) + + +class MixPrecisionOptimizerMon(OptimizerMon): + # parameter tensors we want to monitor and their names are in params2name_dict + # base_optimizer is pytorch optimizer, wrapped_optimizer is a normal object with base_optimizer + def fetch_mv(self, monitor, torch_opt, params2name): + mix_prec_opt = self.wrapped_optimizer + + if not self.fp16_to_fp32_param and mix_prec_opt is not None: + for fp16_group, fp32_group in zip(mix_prec_opt.float16_groups, mix_prec_opt.fp32_from_float16_groups): + for fp16_param, fp32_param in zip(fp16_group, fp32_group): + self.fp16_to_fp32_param[fp16_param] = fp32_param + return self._fetch_mv_in_adam(monitor, torch_opt, params2name) + + +class MegatronDistributedOptimizerMon(OptimizerMon): + def fetch_mv(self, monitor, torch_opt, params2name): + mix_prec_opt = self.wrapped_optimizer + if not (hasattr(mix_prec_opt, "model_float16_groups") and hasattr(mix_prec_opt, + "shard_fp32_from_float16_groups")): + raise Exception("megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, \ + if not, please check megatron-lm version") + if not self.fp16_to_fp32_param and mix_prec_opt is not None: + for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, + mix_prec_opt.shard_fp32_from_float16_groups): + for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group): + self.fp16_to_fp32_param[fp16_param] = shard_fp32_param + + return self._fetch_mv_in_adam(monitor, torch_opt, params2name) + + +class MegatronFP32OptimizerMon(OptimizerMon): + def fetch_mv(self, monitor, torch_opt, params2name): + return self._fetch_mv_in_adam(monitor, torch_opt, params2name) + + +class DeepSpeedZeroOptimizerStage0Mon(OptimizerMon): + def fetch_mv(self, monitor, torch_opt, params2name): + return self._fetch_mv_in_adam(monitor, torch_opt, params2name) + + +class DeepSpeedZeroOptimizerStage3Mon(OptimizerMon): + def get_param_index(self, params2name, name2index): + mix_prec_opt = OptimizerMon.wrapped_optimizer + fp16_groups = mix_prec_opt.fp16_partitioned_groups + name2indices = defaultdict() + index_length = defaultdict() + index = 0 + idx = 0 + for group_idx, fp16_group in enumerate(fp16_groups): + for param in fp16_group: + param_length = len(param.flatten()) + index_length[idx] = (index, index + param_length, group_idx) + index += param_length + idx += 1 + for _, name in params2name.items(): + idx = name2index[name] + start_idx, end_idx, group_idx = index_length[idx] + name2indices[name] = (start_idx, end_idx, group_idx, None) + return name2indices + + def fetch_mv(self, monitor, torch_opt, params2name, name2indices): + self.is_stage3 = True + mix_prec_opt = OptimizerMon.wrapped_optimizer + fp32_partitioned_groups_flat = mix_prec_opt.fp32_partitioned_groups_flat + return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat) + + +class 
DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon): + def get_param_index(self, params2name, name2index): + mix_prec_opt = OptimizerMon.wrapped_optimizer + padding = mix_prec_opt.groups_padding + world_size = dist.get_world_size() + fp32_length = [0] + for fp32_group_index, single_partition_of_fp32_group in enumerate(mix_prec_opt.single_partition_of_fp32_groups): + fp32_length.append(len(single_partition_of_fp32_group) * world_size + fp32_length[fp32_group_index]) + + def get_group_index(fp32_length, world_size, index): + for i in range(len(fp32_length) - 1): + if fp32_length[i] <= index < fp32_length[i + 1]: + interval_start = fp32_length[i] + interval_length = fp32_length[i + 1] - fp32_length[i] + sub_interval_length = interval_length // world_size + sub_index = (index - interval_start) // sub_interval_length + sub_interval_start = interval_start + sub_index * sub_interval_length + return sub_interval_start, min(sub_index, world_size - 1) + return fp32_length[-1], 0 + + bf16_groups = [] + name2indices = defaultdict() + index_length = defaultdict() + index = 0 + idx = 0 + for group_idx, bf16_group in enumerate(mix_prec_opt.bit16_groups): + bf16_groups.extend(bf16_group) + for param in bf16_group: + param_length = len(param.flatten()) + group_index, group_with_rank = get_group_index(fp32_length, world_size, index) + index_length[idx] = (index, index + param_length, group_idx, group_index, group_with_rank) + index += param_length + idx += 1 + group_length = len(bf16_groups) / len(mix_prec_opt.bit16_groups) + for _, name in params2name.items(): + name_index = name2index[name] + start_idx, end_idx, group_idx, group_index, group_with_rank = index_length[name_index] + need_padding = True if group_with_rank == world_size - 1 else False + new_start_idx = start_idx - group_index + new_end_idx = end_idx - group_index + if need_padding and group_length - 1 <= name_index <= len(bf16_groups) - 1 and name_index % ( + group_length - 1) == 0: + new_end_idx -= padding[int(name_index // (group_length - 1) - 1)] + name2indices[name] = (new_start_idx, new_end_idx, group_idx, group_with_rank) + return name2indices + + def fetch_mv(self, monitor, torch_opt, params2name, name2indices): + mix_prec_opt = OptimizerMon.wrapped_optimizer + fp32_partitioned_groups_flat = mix_prec_opt.single_partition_of_fp32_groups + return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat) + + +class DummyOptimizerMon(OptimizerMon): + def fetch_mv(self, monitor, torch_opt, params2name): + return MVResult(exp_avg=None, exp_avg_sq=None, update=None, ratio=None) + + +class OptimizerMonFactory: + _optimizer_mon_map = { + "Megatron_Float16OptimizerWithFloat16Params": MixPrecisionOptimizerMon, + "Megatron_DistributedOptimizer": MegatronDistributedOptimizerMon, + "Megatron_FP32Optimizer": MegatronFP32OptimizerMon, + "DeepSpeedZeroOptimizer_Stage0": DeepSpeedZeroOptimizerStage0Mon, + "DeepSpeedZeroOptimizer_Stage1_or_2": DeepSpeedZeroOptimizerStage1or2Mon, + "DeepSpeedZeroOptimizer_Stage3": DeepSpeedZeroOptimizerStage3Mon, + "unknown": DummyOptimizerMon + } + + @staticmethod + def create_optimizer_mon(opt_ty: str): + if not opt_ty: + return DummyOptimizerMon() + optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(opt_ty) + if not optimizer_mon_class: + raise Exception("opt_ty should be one of: " + ", ".join(OptimizerMonFactory._optimizer_mon_map.keys())) + return optimizer_mon_class() diff --git a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py 
b/debug/accuracy_tools/monitor/monitor/unittest/cc_utils.py similarity index 94% rename from debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py rename to debug/accuracy_tools/monitor/monitor/unittest/cc_utils.py index aa1ff688ec1c417204fc067e636535fef8a35bc0..628e42a1ba2e8cedc2cf40053dfa2c2e96cc9055 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/cc_utils.py @@ -11,8 +11,8 @@ except: BACKEND = 'nccl' DEVICE = 'cuda' -from kj600.features import square_sum, get_max, get_min, get_zeros -from kj600.module_hook import CommunicationContext +from monitor.features import square_sum, get_max, get_min, get_zeros +from monitor.module_hook import CommunicationContext OP_FUNCS = { diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_basic_functions.json b/debug/accuracy_tools/monitor/monitor/unittest/config_basic_functions.json similarity index 100% rename from debug/accuracy_tools/kj600/kj600/unittest/config_basic_functions.json rename to debug/accuracy_tools/monitor/monitor/unittest/config_basic_functions.json diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_cc.json b/debug/accuracy_tools/monitor/monitor/unittest/config_cc.json similarity index 100% rename from debug/accuracy_tools/kj600/kj600/unittest/config_cc.json rename to debug/accuracy_tools/monitor/monitor/unittest/config_cc.json diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_cc_codeline_ranks.json b/debug/accuracy_tools/monitor/monitor/unittest/config_cc_codeline_ranks.json similarity index 49% rename from debug/accuracy_tools/kj600/kj600/unittest/config_cc_codeline_ranks.json rename to debug/accuracy_tools/monitor/monitor/unittest/config_cc_codeline_ranks.json index 720fbb9dd0ee639a412c4a7e62b3a6a73fce227d..f139e9b27557c11a02060e686043e83d2120f1de 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/config_cc_codeline_ranks.json +++ b/debug/accuracy_tools/monitor/monitor/unittest/config_cc_codeline_ranks.json @@ -2,7 +2,7 @@ "targets": { "foo": {} }, - "cc_distribution": {"enable": true, "cc_codeline":["kj600/unittest/test_cc_codeline_ranks.py\\[19\\]"]}, + "cc_distribution": {"enable": true, "cc_codeline":["monitor/unittest/test_cc_codeline_ranks.py\\[19\\]"]}, "module_ranks": [1], "ops":["max","min","norm","zeros"] } \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_cc_logonly.json b/debug/accuracy_tools/monitor/monitor/unittest/config_cc_logonly.json similarity index 100% rename from debug/accuracy_tools/kj600/kj600/unittest/config_cc_logonly.json rename to debug/accuracy_tools/monitor/monitor/unittest/config_cc_logonly.json diff --git a/debug/accuracy_tools/kj600/kj600/unittest/expected_cc_log.json b/debug/accuracy_tools/monitor/monitor/unittest/expected_cc_log.json similarity index 40% rename from debug/accuracy_tools/kj600/kj600/unittest/expected_cc_log.json rename to debug/accuracy_tools/monitor/monitor/unittest/expected_cc_log.json index 8f2edd7ecdb373242f40ae938ca9a880a45e3264..8204f4a5d5feea0aaf588546700674fa29f49e7e 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/expected_cc_log.json +++ b/debug/accuracy_tools/monitor/monitor/unittest/expected_cc_log.json @@ -3,8 +3,8 @@ [ "|torch.float32||", "0|1", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[18] test_all_gather", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[40] main", + "/home/jovyan/workspace/kj_dev/monitor/unittest/test_cc_log_only.py[18] test_all_gather", + 
"/home/jovyan/workspace/kj_dev/monitor/unittest/test_cc_log_only.py[40] main", "[1] " ] ], @@ -12,8 +12,8 @@ [ "torch.float32|||", "0|1", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[23] test_all_reduce", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[41] main", + "/home/jovyan/workspace/kj_dev/monitor/unittest/test_cc_log_only.py[23] test_all_reduce", + "/home/jovyan/workspace/kj_dev/monitor/unittest/test_cc_log_only.py[41] main", "[1] " ] ] diff --git a/debug/accuracy_tools/monitor/monitor/unittest/test_anomaly_detect.py b/debug/accuracy_tools/monitor/monitor/unittest/test_anomaly_detect.py new file mode 100644 index 0000000000000000000000000000000000000000..f4ca6b8801bb3508ccbafadfbfbda32d8421311a --- /dev/null +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_anomaly_detect.py @@ -0,0 +1,19 @@ +import unittest +from unittest import TestCase + +from monitor.anomaly_detect import BaseWriterWithAD + + +class TestBaseWriterWithAD(TestCase): + def test_update_tag2scalars(self): + writer = BaseWriterWithAD('', None, None) + writer._update_tag2scalars('tag1', 1.0) + self.assertEqual(writer.tag2scalars['tag1']['avg'], 1.0) + self.assertEqual(writer.tag2scalars['tag1']['count'], 1) + writer._update_tag2scalars('tag1', 2.0) + self.assertEqual(writer.tag2scalars['tag1']['avg'], 1.5) + self.assertEqual(writer.tag2scalars['tag1']['count'], 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_anomaly_inform.py b/debug/accuracy_tools/monitor/monitor/unittest/test_anomaly_inform.py similarity index 96% rename from debug/accuracy_tools/kj600/kj600/unittest/test_anomaly_inform.py rename to debug/accuracy_tools/monitor/monitor/unittest/test_anomaly_inform.py index 1ad76b919eb5fcce24539bf777d368047e6458c4..ab2ec0bcc9ae72fbefbcb59b86ff832c771ce76e 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_anomaly_inform.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_anomaly_inform.py @@ -1,7 +1,7 @@ import uuid import unittest -from kj600.anomaly_inform import AnomalyInformFactory +from monitor.anomaly_inform import AnomalyInformFactory class TestAnomalyInform(unittest.TestCase): diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_basic_functions.py b/debug/accuracy_tools/monitor/monitor/unittest/test_basic_functions.py similarity index 94% rename from debug/accuracy_tools/kj600/kj600/unittest/test_basic_functions.py rename to debug/accuracy_tools/monitor/monitor/unittest/test_basic_functions.py index b7cdd3385b575b231702411fb01ebca2b67613bf..dc77fa39f1e9aeec371ad7105707c8a3c5ad7f0a 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_basic_functions.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_basic_functions.py @@ -8,7 +8,7 @@ try: device = torch.device('npu:0') except ModuleNotFoundError: device = torch.device('cpu') -from kj600.module_hook import TrainerMon +from monitor.module_hook import TrainerMon from tensorboard.backend.event_processing.event_accumulator import EventAccumulator @@ -30,13 +30,13 @@ class ToyDataset(torch.utils.data.Dataset): def __getitem__(self, idx): return self.data[idx].to(device), self.labels[idx].to(device) def get_file_path(): - output_dir = os.environ.get("KJ600_OUTPUT_DIR") + output_dir = os.environ.get("MONITOR_OUTPUT_DIR") for root1, dirs, files in os.walk(output_dir): for root2, dir, file in os.walk(os.path.join(root1, dirs[-1])): return os.path.join(root2, file[0]) def get_config(): - os.environ["KJ600_OUTPUT_DIR"] = 
"./test_kj600_output" + os.environ["MONITOR_OUTPUT_DIR"] = "./test_monitor_output" with open("config_basic_functions.json", 'r') as file: config_test = json.load(file) return config_test @@ -55,7 +55,7 @@ def get_tensorbaord(event_file_path): return scalers_tag, images_tag def clean_output(): - folder_path = os.environ.get("KJ600_OUTPUT_DIR") + folder_path = os.environ.get("MONITOR_OUTPUT_DIR") if os.path.exists(folder_path): shutil.rmtree(folder_path) @@ -86,9 +86,9 @@ def train(): loss.backward() optimizer.step() -class TestKj600(unittest.TestCase): +class TestMonitor(unittest.TestCase): def __init__(self, method_name: str) -> None: - super(TestKj600, self).__init__(method_name) + super(TestMonitor, self).__init__(method_name) self.config_test = get_config() self.event_file_path = None self.scalers_tag = None diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_cc.py b/debug/accuracy_tools/monitor/monitor/unittest/test_cc.py similarity index 98% rename from debug/accuracy_tools/kj600/kj600/unittest/test_cc.py rename to debug/accuracy_tools/monitor/monitor/unittest/test_cc.py index b5e92417a41e09539a00761e743670ba1b409ff7..3690ceef34afa2c8b4f95e1de9d513c48cf13cdd 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_cc.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_cc.py @@ -5,8 +5,8 @@ import torch from torch import nn from torch import distributed as dist import torch.multiprocessing as mp -from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import * +from monitor.module_hook import TrainerMon +from monitor.unittest.cc_utils import * DEBUG = False DIM = 2 @@ -231,7 +231,7 @@ def main(rank, world_size): steps = 2 net = Model() - monitor = TrainerMon("kj600/unittest/config_cc.json", opt_ty="Megatron_Float16OptimizerWithFloat16Params") + monitor = TrainerMon("monitor/unittest/config_cc.json", opt_ty="Megatron_Float16OptimizerWithFloat16Params") # monitor = None # monitor.hook_optimizer() # to enable tb optimizer = torch.optim.Adam(net.parameters()) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_codeline_ranks.py b/debug/accuracy_tools/monitor/monitor/unittest/test_cc_codeline_ranks.py similarity index 91% rename from debug/accuracy_tools/kj600/kj600/unittest/test_cc_codeline_ranks.py rename to debug/accuracy_tools/monitor/monitor/unittest/test_cc_codeline_ranks.py index d635441e155736340648b31dc1eab3d61d03f2fd..6311e3e51ad6c876a996994df38ed45804b7b45e 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_codeline_ranks.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_cc_codeline_ranks.py @@ -3,8 +3,8 @@ sys.path.append(".") import torch from torch import distributed as dist import torch.multiprocessing as mp -from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import * +from monitor.module_hook import TrainerMon +from monitor.unittest.cc_utils import * @wrap_reset def test_all_gather(context, rank, target_rank, world_size, async_op): @@ -32,7 +32,7 @@ def main(rank, world_size): async_op = False net = Model() - monitor = TrainerMon("kj600/unittest/config_cc_codeline_ranks.json") + monitor = TrainerMon("monitor/unittest/config_cc_codeline_ranks.json") target_rank = monitor.module_rank_list # monitor = None # monitor.hook_optimizer() # to enable tb diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_log_only.py b/debug/accuracy_tools/monitor/monitor/unittest/test_cc_log_only.py similarity index 92% rename from debug/accuracy_tools/kj600/kj600/unittest/test_cc_log_only.py rename to 
debug/accuracy_tools/monitor/monitor/unittest/test_cc_log_only.py index d7508d4af51d0549105a92eea3b7ff717924aea4..5388e8f7cb92f31d1fe6c0b60ae6bfd0e1482e21 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_log_only.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_cc_log_only.py @@ -5,8 +5,8 @@ import json import torch from torch import distributed as dist import torch.multiprocessing as mp -from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import * +from monitor.module_hook import TrainerMon +from monitor.unittest.cc_utils import * with open(os.path.join(os.path.dirname(__file__), 'expected_cc_log.json')) as f: @@ -30,7 +30,7 @@ def main(rank, world_size): async_op = False net = Model() - monitor = TrainerMon("kj600/unittest/config_cc_logonly.json") + monitor = TrainerMon("monitor/unittest/config_cc_logonly.json") monitor.hook_optimizer() # to enable tb optimizer = torch.optim.Adam(net.parameters()) cc_context = monitor.cc_context diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_database.py b/debug/accuracy_tools/monitor/monitor/unittest/test_database.py similarity index 96% rename from debug/accuracy_tools/kj600/kj600/unittest/test_database.py rename to debug/accuracy_tools/monitor/monitor/unittest/test_database.py index a9046d9c07ebb1cdf3f16490554f7223d893e51e..c2295658fc5605cbb9e448f68972bc44a471e06f 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_database.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_database.py @@ -5,7 +5,7 @@ from unittest import TestCase from sqlalchemy import inspect -from kj600.database import Database, ExceptionMessage +from monitor.database import Database, ExceptionMessage class TestDatabase(TestCase): diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py b/debug/accuracy_tools/monitor/monitor/unittest/test_features.py similarity index 95% rename from debug/accuracy_tools/kj600/kj600/unittest/test_features.py rename to debug/accuracy_tools/monitor/monitor/unittest/test_features.py index bc8c6dd71ab4e0bf708cf3d97d02dab3a2ded9cc..5a59dc1526a2b999d9fd11d18702320ad4abe05a 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_features.py @@ -2,7 +2,7 @@ import unittest import torch import torch.nn as nn import torch_npu -from kj600.features import eff_rank +from monitor.features import eff_rank class TestFeatureCalculation(unittest.TestCase): diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py b/debug/accuracy_tools/monitor/monitor/unittest/test_module_hook.py similarity index 89% rename from debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py rename to debug/accuracy_tools/monitor/monitor/unittest/test_module_hook.py index f81312691d35825fad05b7ed04db352bc96b2c20..6f9db7643e7cba841124e03d5591fe0066062753 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_module_hook.py @@ -8,7 +8,7 @@ try: except ModuleNotFoundError: device = torch.device('cpu') import torch.nn.functional as F -from kj600.module_hook import TrainerMon # Modify PYTHONPATH to import TrainerMon +from monitor.module_hook import TrainerMon # Modify PYTHONPATH to import TrainerMon #from hook_api import reg_grad_hook, reg_grad_one_hook, reg_module_backward_hook, reg_module_forward_hook #from torch.cuda.amp import GradScaler @@ -23,7 +23,7 @@ from kj600.module_hook import TrainerMon # Modify PYTHONPATH to import TrainerMo 
#debugger = PD(dump_path="./dump/", hook_name="dump", step=[1, 2, 3], enable_dataloader=False) #debugger.configure_hook(mode="list", scope=["optim_Adam_step"], ) -parser = argparse.ArgumentParser(prog="kj600 debug", description="kj600 sample code", epilog="") +parser = argparse.ArgumentParser(prog="monitor debug", description="monitor sample code", epilog="") parser.add_argument("-o", "--out_dir", type=str, default=".") args = parser.parse_args() DTYPE = torch.float32 @@ -54,7 +54,7 @@ config = { # reg_module_backward_hook(net, module_bwd_hook, config) optimizer = torch.optim.Adam(net.parameters(), lr=0.0001) -hooker = TrainerMon('./kj600/unittest/config_1.json', opt_ty = 'Megatron_Float16OptimizerWithFloat16Params') +hooker = TrainerMon('./monitor/unittest/config_1.json', opt_ty = 'Megatron_Float16OptimizerWithFloat16Params') hooker.hook_modules(model=net, global_batch_size=2, dp=1, micro_batch_size=2, fwd_or_bkd=0, params_have_main_grad=False) # hooker.hook_optimizer(optimizer) diff --git a/debug/accuracy_tools/monitor/monitor/unittest/test_monitor.py b/debug/accuracy_tools/monitor/monitor/unittest/test_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..a7017a6be5e01ad1b332c08e2332a47eee1de023 --- /dev/null +++ b/debug/accuracy_tools/monitor/monitor/unittest/test_monitor.py @@ -0,0 +1,145 @@ +import sys +import os +import re +import argparse +import pandas as pd +from glob import glob +from collections import defaultdict + + +def parse_logfile(logfile): + grad_norm = [] + step = [] + with open(logfile) as f: + for line in f.readlines(): + if 'consumed samples' in line: + grad_norm.append(float(re.findall('(?<=grad norm\: )[\d\.]*', line)[0])) + # step = int(re.findall('(?<=iteration)[ \d]*', line)[0]) + return grad_norm + + +def parse_monitor_output(output_dir): + reduced = {} + unreduced = {} + for dir in glob(output_dir+'*'): + rank = int(re.findall('(?<=rank)[\d]*', dir)[0]) + unreduced[rank] = [] + reduced[rank] = [] + for file in os.listdir(dir): + # step = int(re.search("(?<=reduced\_)[\d]*", file)[0]) + # if step != 0: + # continue + df = pd.read_csv(os.path.join(dir, file)) + if '_unreduced_' in file: + unreduced[rank].append(df) + pass + elif '_reduced_' in file: + reduced[rank].append(df) + else: + print(f'unexpected file {file} in {dir}') + return reduced, unreduced + +def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel): + steps = len(reduced[0]) + world_size = len(reduced) + errors = [] + for index, row in unreduced[0][0].iterrows(): + param = row['param_name'] + is_tp_duplicate = False + for step in range(2): + # sum reduced + reduced_mean = 0. + for rank in range(world_size): + if len(reduced[rank]) == 0: + continue + df = reduced[rank][step] + value = list(df[df['param_name'] == param]['mean']) + if value == []: + if step == 0: + is_tp_duplicate = True + continue + reduced_mean += value[0] + + # sum unreduced + unreduced_mean = 0. 
+ for rank in range(world_size): + df = unreduced[rank][step] + value = list(df[df['param_name'] == param]['mean']) + if value == []: + continue + unreduced_mean += list(df[df['param_name'] == param]['mean'])[0] + + unreduced_mean /= dp_size + if is_tp_duplicate and (not sequence_parallel or 'embedding' in param): + unreduced_mean /= tp_size + try: + assert_equal(unreduced_mean, reduced_mean) + except AssertionError as e: + errors.append([param, step, e, is_tp_duplicate]) + if errors: + print(errors) + else: + print(f'grad mean is consistent between the unreduced and reduced grads monitored.') + + + +def assert_equal(a, b): + if b == 0 or a == 0: + return + if b == 0: + rel_diff = a + elif a == 0: + rel_diff = b + else: + rel_diff = abs(a/b-1) + assert rel_diff<0.01, f'{a}, {b}, {rel_diff}' + + +def valid_total_norm(total_norm, reduced, duplicate_embedding): + steps = len(total_norm) + world_size = len(reduced) + errors = [] + for step in range(steps): + calculated_norm = 0. + for rank in range(world_size): + if len(reduced[rank]) == 0: + if step == 0: + print(f'rank {rank} is duplicated in dp group') + continue + for index, row in reduced[rank][step].iterrows(): + if duplicate_embedding and 'word_embedding' in row['param_name']: + continue + calculated_norm += row['norm']**2 + try: + assert_equal(calculated_norm**0.5, total_norm[step]) + except AssertionError as e: + errors.append([step, e]) + if errors: + print('total norm errors: ', errors) + else: + print('grad norm is consistent between the training log and the reduced gradients monitored') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--monitor_output', '-m', type=str, required=True, help='path prefix to the output of monitor e.g. monitor_output/Aug12_07-16') + parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file') + parser.add_argument('--tp_size', '-t', type=int, required=True, help='tp parallel size') + parser.add_argument('--dp_size', '-d', type=int, required=True, help='dp parallel size') + parser.add_argument('--pp_size', '-p', type=int, required=True, help='pp parallel size') + parser.add_argument('--untie_embeddings_and_output_weights', '-u', action="store_true", default=False, help='whether untie_embeddings_and_output_weights in pp parallel') + parser.add_argument('--sequence_parallel', '-s', action="store_true", default=False, help='whether sequence parallel is enabled.
Add -s to store true') + + args = parser.parse_args() + + assert args.tp_size > 0, 'if tp not enabled, set tp_size = 1' + assert args.dp_size > 0, 'if dp not enabled, set dp_size = 1' + assert args.pp_size > 0, 'if pp not enabled, set pp_size = 1' + + total_norm = parse_logfile(args.logfile) + reduced, unreduced = parse_monitor_output(args.monitor_output) + + duplicate_embedding = not args.untie_embeddings_and_output_weights and args.pp_size > 1 + + valid_total_norm(total_norm, reduced, duplicate_embedding) + valid_reduce(reduced, unreduced, args.tp_size, args.dp_size, args.sequence_parallel) \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/utils.py b/debug/accuracy_tools/monitor/monitor/utils.py similarity index 54% rename from debug/accuracy_tools/kj600/kj600/utils.py rename to debug/accuracy_tools/monitor/monitor/utils.py index 53d47d9988647202bdb711afde38b94b51899b5a..05eec38871fd58af8d5d69bd9b24abcc2a4aaa1c 100644 --- a/debug/accuracy_tools/kj600/kj600/utils.py +++ b/debug/accuracy_tools/monitor/monitor/utils.py @@ -2,6 +2,12 @@ import os import time import sys import re +import warnings +import torch + +from monitor.const import Const + +warnings.filterwarnings("ignore", category=UserWarning, message="torch.distributed") FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024 FILE_NAME_MAX_LENGTH = 255 @@ -44,12 +50,21 @@ def print_warn_log(warn_msg): """ _print_log("WARNING", warn_msg) + def get_param_struct(param): - if isinstance(param, tuple): - return f"tuple[{len(param)}]" - if isinstance(param, list): - return f"list[{len(param)}]" - return "tensor" + res = {} + if isinstance(param, (tuple, list)): + res['config'] = f'{type(param).__name__}[{len(param)}]' + for i, x in enumerate(param): + res[i] = f'size={tuple(x.shape)}, dtype={x.dtype}' if torch.is_tensor(x) else x + elif torch.is_tensor(param): + res['config'] = 'tensor' + res['tensor'] = f'size={tuple(param.shape)}, dtype={param.dtype}' + else: + res['config'] = f'{type(param)}' + print_warn_log(f'Unsupported type ({type(param)}); please check the type of param {param}') + return res + def check_link(path): abs_path = os.path.abspath(path) @@ -107,4 +122,48 @@ def check_file_valid_readable(path): def check_file_valid_writable(path): check_file_valid(path) check_path_writability(path) + + +def make_file_safety(file_path: str, permission=0o640): + if os.path.islink(file_path): + raise RuntimeError(f"Invalid soft link path: {file_path}") + file_real_path = os.path.realpath(file_path) + if os.path.exists(file_real_path): + return + parent_path = os.path.dirname(file_real_path) + if not os.path.exists(parent_path): + os.makedirs(parent_path, mode=0o750, exist_ok=True) + if not os.access(parent_path, os.W_OK): + raise PermissionError(f"The path {parent_path} is not writable!") + try: + os.close(os.open(file_real_path, os.O_WRONLY | os.O_CREAT, permission)) + except OSError as e: + raise RuntimeError("Can't create file: " + file_real_path) from e + os.chmod(file_real_path, permission) + + +def create_directory(dir_path): + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=0o750, exist_ok=True) + except OSError as ex: + raise RuntimeError("Failed to create directory.
Please check the path permission or disk space.") from ex + +def validate_ops(ops): + if not isinstance(ops, list): + raise Exception("ops should be a list") + if not ops: + raise Exception("ops is required to calculate metrics") + valid_ops = [] + for op in ops: + if op not in Const.OP_LIST: + print_warn_log(f"given op {op} is not supported. Optional ops: {Const.OP_LIST}") + else: + valid_ops.append(op) + return valid_ops + + +def validate_config(config): + ops = config.get("ops", []) + config["ops"] = validate_ops(ops) \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/visualizer.py b/debug/accuracy_tools/monitor/monitor/visualizer.py similarity index 97% rename from debug/accuracy_tools/kj600/kj600/visualizer.py rename to debug/accuracy_tools/monitor/monitor/visualizer.py index e1929bfa3fb338b1cb66cda80a128e83176bfcbf..151f1ea1c451a5f27d250df591c4d00f64a1a34c 100644 --- a/debug/accuracy_tools/kj600/kj600/visualizer.py +++ b/debug/accuracy_tools/monitor/monitor/visualizer.py @@ -1,7 +1,7 @@ import torch import numpy as np import matplotlib.pyplot as plt -from kj600.features import cal_histc +from monitor.features import cal_histc class HeatmapVisualizer: diff --git a/debug/accuracy_tools/kj600/pyproject.toml b/debug/accuracy_tools/monitor/pyproject.toml similarity index 71% rename from debug/accuracy_tools/kj600/pyproject.toml rename to debug/accuracy_tools/monitor/pyproject.toml index 5df968563345dd07ed477ec73b967b63c6e812a6..57a1be653c8baa0539732f68f8bcc1616bd9c6da 100644 --- a/debug/accuracy_tools/kj600/pyproject.toml +++ b/debug/accuracy_tools/monitor/pyproject.toml @@ -3,17 +3,16 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] -name = "kj600" +name = "monitor" version = "0.0.1" dependencies = [ - "torch", - "torch_npu", "torchvision", "tensorboard", - "matplotlib", - "sqlalchemy", - "pymysql" + "matplotlib" ] [tool.setuptools.packages] -find = {} # Scan the project directory with the default parameters \ No newline at end of file +find = {} # Scan the project directory with the default parameters + +[tool.setuptools.package-data] +monitor = ["distributed/*.yaml"] \ No newline at end of file diff --git "a/debug/accuracy_tools/kj600/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" "b/debug/accuracy_tools/monitor/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" similarity index 100% rename from "debug/accuracy_tools/kj600/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" rename to "debug/accuracy_tools/monitor/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" diff --git a/debug/accuracy_tools/msprobe/pytorch/__init__.py b/debug/accuracy_tools/msprobe/pytorch/__init__.py index 482e850f7baa845bd831e0d4728e841661b9345b..dcdf4cb3a3adf09f341d985dd82e916348c14462 100644 --- a/debug/accuracy_tools/msprobe/pytorch/__init__.py +++ b/debug/accuracy_tools/msprobe/pytorch/__init__.py @@ -2,3 +2,5 @@ from .debugger.precision_debugger import PrecisionDebugger from .common.utils import seed_all from .compare.acc_compare import compare from .compare.distributed_compare import compare_distributed +from .visualization.builder.graph_builder import GraphBuilder 
+from .visualization.compare.graph_comparator import GraphComparator diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py index e214910566e6af920f48a8f2c4c4f7cb47b36b10..df20e612fd8789d120b4280b77346c7defde5e9d 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py @@ -960,8 +960,9 @@ def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False if npu_ops_queue: for npu_data in npu_ops_queue: get_un_match_accuracy(result, npu_data, md5_compare, summary_compare) + result_to_csv(md5_compare, summary_compare, stack_mode, result, output_csv_handle) - header = [] +def result_to_csv(md5_compare, summary_compare, stack_mode, result, output_csv_handle): if md5_compare: header = CompareConst.MD5_COMPARE_RESULT_HEADER[:] elif summary_compare: diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..f623a48ae3b9607103b4af63bd8838d3d13c8a0b --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
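For orientation before the GraphBuilder implementation that follows: build() consumes two dump artifacts, construct.json (a flat child-to-parent mapping of node names) and dump.json (per-node tensor records). A minimal sketch of the expected shapes; the node names and field names here are hypothetical illustrations, not taken from a real dump:

# Hypothetical miniature inputs for GraphBuilder.build().
# construct.json: maps each node to its parent (top-level nodes map to None).
construct_dict = {
    "Module.root.Model.forward.0": None,
    "Module.layer1.Linear.forward.0": "Module.root.Model.forward.0",
}
# dump.json (after load_data_json_file): per-node input/output records.
data_dict = {
    "Module.layer1.Linear.forward.0": {
        "input_args": [{"type": "torch.Tensor", "dtype": "torch.float32", "shape": [2, 4]}],
        "output": [{"type": "torch.Tensor", "dtype": "torch.float32", "shape": [2, 8]}],
    },
}
# _init_nodes walks construct_dict, creating each parent on demand and attaching
# children to it, so the resulting Graph mirrors the module tree.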
+ +from ..graph.graph import Graph +from ..graph.node_op import NodeOp +from ..utils import load_json_file, load_data_json_file, save_json_file, GraphConst +from .msprobe_adapter import get_input_output + + +class GraphBuilder: + @staticmethod + def build(construct_path, data_path, model_name='DefaultModel'): + """ + The public graph-construction method of GraphBuilder + Args: + construct_path: path to construct.json + data_path: path to dump.json + model_name: model name, provided externally + Returns: Graph, the data structure representing the graph + """ + construct_dict = load_json_file(construct_path) + data_dict = load_data_json_file(data_path) + graph = Graph(model_name) + GraphBuilder._init_nodes(graph, construct_dict, data_dict) + return graph + + @staticmethod + def to_json(filename, graph_n, graph_b=None, tool_tip=None): + """ + Interface for exporting a graph to a .vis file + Args: + filename: output file path + graph_n: Graph + graph_b: bench Graph; if None, only graph_n is exported, otherwise both graphs are exported together as a comparison result + tool_tip: suggestions emitted in comparison mode + """ + result = {} + if graph_b: + result[GraphConst.JSON_NPU_KEY] = graph_n.to_dict() + result[GraphConst.JSON_BENCH_KEY] = graph_b.to_dict() + else: + result = graph_n.to_dict() + if tool_tip: + result[GraphConst.JSON_TIP_KEY] = tool_tip + save_json_file(filename, result) + + @staticmethod + def _init_nodes(graph, construct_dict, data_dict): + for subnode_id, upnode_id in construct_dict.items(): + if upnode_id: + upnode_op = NodeOp.get_node_op(upnode_id) + upnode = GraphBuilder._create_or_get_node(graph, data_dict, upnode_op, upnode_id) + else: + upnode = graph.root + node_op = NodeOp.get_node_op(subnode_id) + GraphBuilder._create_or_get_node(graph, data_dict, node_op, subnode_id, upnode) + + @staticmethod + def _create_or_get_node(graph, data_dict, op, name, upnode=None): + if name in graph.node_map: + node = graph.get_node(name) + else: + graph.add_node(op, name, upnode) + node = graph.get_node(name) + node_data = data_dict.get(name, {}) + # Attach the input/output data + input_data, output_data = get_input_output(node_data, node.id) + # Update the node's data + node.set_input_output(input_data, output_data) + # Link the node to its parent + node.add_upnode(upnode) + return node \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ea0dfabedf7c482975094abdd981baa1afeb44e --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py @@ -0,0 +1,185 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
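The adapter below maps node names to node types purely by name prefix, via the op_patterns list near the top of the file. A self-contained sketch of that dispatch; the classify() helper is illustrative and not part of the module:

import re

# Patterns copied from msprobe_adapter below; index 0 -> NodeOp.module, index 1 -> NodeOp.function_api.
op_patterns = [
    r'^(Module)',
    r'^(Tensor|Torch|Functional|NPU|VF|Distributed|Aten)',
]

def classify(node_name):
    # Return the index of the first matching rule, mirroring how NodeOp.get_node_op is expected to dispatch.
    for idx, pattern in enumerate(op_patterns):
        if re.match(pattern, node_name):
            return idx
    raise ValueError(f'unrecognized node name: {node_name}')

assert classify('Module.layer1.Linear.forward.0') == 0
assert classify('Functional.relu.0.forward') == 1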
+ +import re +from ...compare.acc_compare import read_op, merge_tensor, get_accuracy, _do_multi_process +from ....core.common.utils import task_dumppath_get +from ..utils import GraphConst + + +# Rules for parsing node names into the corresponding NodeOp +op_patterns = [ + r'^(Module)', #NodeOp.module + r'^(Tensor|Torch|Functional|NPU|VF|Distributed|Aten)' #NodeOp.function_api +] + + +def get_compare_mode(dump_path_param): + """ + Get the comparison mode: summary, MD5, or real data + Args: + dump_path_param: parameters required by the acc_compare interface + Returns: 0 summary mode, 1 md5 mode, 2 true data mode + """ + summary_compare, md5_compare = task_dumppath_get(dump_path_param) + if summary_compare: + compare_mode = GraphConst.SUMMARY_COMPARE + elif md5_compare: + compare_mode = GraphConst.MD5_COMPARE + else: + compare_mode = GraphConst.REAL_DATA_COMPARE + return compare_mode + + +def run_real_data(dump_path_param, csv_path): + """ + Generate real-data comparison results with multiple processes + Args: + dump_path_param: parameters required by the acc_compare interface + csv_path: path of the generated file + """ + return _do_multi_process(dump_path_param, csv_path) + + +def get_input_output(node_data, node_id): + """ + Split the raw dump data into two parts: output and input + Args: + node_data: dump data belonging to a single node + node_id: node name + """ + input_data = {} + output_data = {} + op_parsed_list = read_op(node_data, node_id) + for item in op_parsed_list: + full_op_name = item.get('full_op_name', '') + if not full_op_name: + continue + splits = full_op_name.split('.') + if len(splits) <= GraphConst.OUTPUT_INDEX: + continue + if 'output' in splits[GraphConst.OUTPUT_INDEX]: + output_data[full_op_name] = item + else: + input_data[full_op_name] = item + return input_data, output_data + + +def compare_data(data_dict_list1, data_dict_list2): + """ + Check whether two results from get_input_output are structurally identical; return True if they match + """ + if len(data_dict_list1) != len(data_dict_list2): + return False + # Key fields used to decide whether two nodes are equal + tag_keys = ['type', 'dtype', 'shape'] + for key1, key2 in zip(data_dict_list1, data_dict_list2): + dict1 = data_dict_list1[key1] + dict2 = data_dict_list2[key2] + for tag_key in tag_keys: + tag_value1 = dict1.get(tag_key, None) + tag_value2 = dict2.get(tag_key, None) + if tag_value1 != tag_value2: + return False + return True + + +def format_node_data(data_dict): + """ + Format node data for output in batches + """ + del_list = ['requires_grad', 'data_name', 'full_op_name'] + for _, value in data_dict.items(): + if not isinstance(value, dict): + continue + for item in del_list: + if item in value: + del value[item] + _format_data(value) + return data_dict + + +def compare_node(node_ids, data_dicts, stack_json_data, is_summary_compare, is_md5_compare): + """ + Call get_accuracy in acc_compare.py to obtain accuracy comparison metrics + In real-data comparison mode the metrics cannot be obtained here; the multi-process comparison interface must be called + Returns: a list containing parameter info and comparison metrics (except in real-data comparison mode) + """ + merge_n = _parse_node(node_ids[0], data_dicts[0], stack_json_data, is_summary_compare, is_md5_compare) + merge_b = _parse_node(node_ids[1], data_dicts[1], stack_json_data, is_summary_compare, is_md5_compare) + result = [] + get_accuracy(result, merge_n, merge_b, is_summary_compare, is_md5_compare) + return result + + +def _parse_node(node_id, data_dict, stack_json_data, is_summary_compare, is_md5_compare): + """ + Convert a node so that it can be passed to get_accuracy in acc_compare.py + """ + op_parsed_list = read_op(data_dict.get(node_id, {}), node_id) + if node_id in stack_json_data: + op_parsed_list.append( + {'full_op_name': node_id, 'full_info': stack_json_data[node_id]}) + else: + op_parsed_list.append({'full_op_name': node_id, 'full_info': None}) + result = merge_tensor(op_parsed_list, is_summary_compare, is_md5_compare) + if not result: + result['op_name'] = [] + return result + + +def
_format_decimal_string(s): + """ + 使用正则表达式匹配包含数字、小数点和可选的百分号的字符串 + """ + pattern = re.compile(r'\d{1,20}\.\d{1,20}%?') + matches = pattern.findall(s) + for match in matches: + is_percent = match.endswith('%') + number_str = match.rstrip('%') + decimal_part = number_str.split('.')[1] + # 如果小数位数大于6,进行处理 + if len(decimal_part) > GraphConst.ROUND_TH: + number_float = float(number_str) + formatted_number = f"{number_float:.{GraphConst.ROUND_TH}f}" + # 如果原来是百分数,加回百分号 + if is_percent: + formatted_number += '%' + # 替换原字符串中的数值部分 + s = s.replace(match, formatted_number) + return s + + +def _format_data(data_dict): + """ + 格式化数据,小数保留6位,处理一些异常值 + """ + pattern = r'^[+-]?(\d+(.\d*)?|.\d+)([eE][+-]?\d+)$' + for key, value in data_dict.items(): + if isinstance(value, str): + # 将单引号删掉,None换成null避免前端解析错误 + value = value.replace("'", "").replace('None', 'null') + value = _format_decimal_string(value) + elif value is None or value == ' ': + value = 'null' + # 科学计数法1.123123123123e-11,格式化为1.123123e-11 + elif isinstance(value, float) and len(str(value)) < GraphConst.STR_MAX_LEN and re.match(pattern, str(value)): + value = "{:.6e}".format(value) + elif isinstance(value, float): + value = round(value, GraphConst.ROUND_TH) + # Inf会走入这里,确保转成Inf。另外给其他不符合预期的类型做兜底方案 + if not isinstance(value, (list, tuple, dict, str)): + value = str(value) + data_dict[key] = value diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..3d5f2972468adab8a436167d2f50eab9ace05873 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py @@ -0,0 +1,104 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
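As a standalone illustration (not part of the patch) of the rounding rule implemented by _format_decimal_string above, the sketch below reproduces the behavior with plain re: decimals longer than ROUND_TH (6) places are rounded and a trailing percent sign is preserved:

    import re

    def round_decimals(s: str, places: int = 6) -> str:
        # mirror _format_decimal_string: round long decimals, keep the '%' suffix
        for match in re.findall(r'\d{1,20}\.\d{1,20}%?', s):
            number_str = match.rstrip('%')
            if len(number_str.split('.')[1]) > places:
                formatted = f"{float(number_str):.{places}f}"
                if match.endswith('%'):
                    formatted += '%'
                s = s.replace(match, formatted)
        return s

    print(round_decimals("ratio: 0.123456789%"))  # -> ratio: 0.123457%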
+
+from ..builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data
+from ..utils import GraphConst, load_json_file, load_data_json_file, get_csv_df
+from ..graph.graph import Graph
+from .mode_adapter import ModeAdapter
+
+
+class GraphComparator:
+    def __init__(self, graphs, data_paths, stack_path, output_path):
+        self.graph_n = graphs[0]
+        self.graph_b = graphs[1]
+        self._parse_param(data_paths, stack_path, output_path)
+
+    def compare(self):
+        """
+        Comparison entry point, called separately after initialization; results are written into graph_n
+        """
+        self._compare_nodes(self.graph_n.root)
+        self._postcompare()
+
+    def add_compare_result_to_node(self, node, compare_result_list):
+        """
+        Attach comparison results to the node's input and output data
+        Args:
+            node: the node
+            compare_result_list: list containing parameter info and accuracy metrics (except in real-data compare mode)
+        """
+        # for real-data comparison, stash the node first; metrics are attached after the multi-process compare
+        if self.ma.prepare_real_data(node):
+            return
+        compare_in_dict = {}
+        compare_out_dict = {}
+        # separate input and output comparison data
+        for item in compare_result_list:
+            if 'output' in item[0]:
+                compare_out_dict[item[0]] = item
+            else:
+                compare_in_dict[item[0]] = item
+        precision_status, precision_index, other_dict = self.ma.parse_result(node, [compare_in_dict, compare_out_dict])
+        node.data[GraphConst.JSON_STATUS_KEY] = precision_status
+        node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+        node.data.update(other_dict)
+        if not precision_status:
+            self.ma.add_error_key(node.output_data)
+            node.get_suggestions()
+
+    def _parse_param(self, data_paths, stack_path, output_path):
+        self.dump_path_param = {
+            'npu_json_path': data_paths[0],
+            'bench_json_path': data_paths[1],
+            'stack_json_path': stack_path,
+            'is_print_compare_log': True
+        }
+        self.output_path = output_path
+        compare_mode = get_compare_mode(self.dump_path_param)
+        self.ma = ModeAdapter(compare_mode)
+        self.data_n_dict = load_data_json_file(data_paths[0])
+        self.data_b_dict = load_data_json_file(data_paths[1])
+        self.stack_json_data = load_json_file(stack_path)
+
+    def _postcompare(self):
+        if not self.ma.is_real_data_compare():
+            return
+        df = get_csv_df(self.ma.is_md5_compare(), self.ma.is_summary_compare(), True, self.ma.csv_data)
+        df = run_real_data(self.dump_path_param, df)
+        compare_data_dict = {row[0]: row.tolist() for _, row in df.iterrows()}
+        for node in self.ma.compare_nodes:
+            precision_status, precision_index, _ = self.ma.parse_result(node, [compare_data_dict])
+            node.data[GraphConst.JSON_STATUS_KEY] = precision_status
+            node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+            if not precision_status:
+                self.ma.add_error_key(node.output_data)
+                node.get_suggestions()
+
+    def _compare_nodes(self, node_n):
+        # Traverse the NPU tree recursively; when a node with the same name exists in the bench graph and their ancestors and parameter info are consistent, compare their accuracy data
+        # Pre-order traversal is used: when a node is compared, its predecessors have already been matched, which provides important information for later fuzzy matching
+        node_b, ancestors = Graph.match(self.graph_n, node_n, self.graph_b)
+        if node_b:
+            ancestors.append(node_b.id)
+            node_n.add_link(node_b, ancestors)
+            # real-data comparison only yields basic info without accuracy metrics; the multi-process compare interface is required
+            compare_result_list = compare_node([node_n.id, node_b.id], [self.data_n_dict, self.data_b_dict],
+                                               self.stack_json_data, self.ma.is_summary_compare(),
+                                               self.ma.is_md5_compare())
+            if compare_result_list:
+                self.ma.add_csv_data(compare_result_list)
+                self.add_compare_result_to_node(node_n, compare_result_list)
+        for subnode in node_n.subnodes:
+            self._compare_nodes(subnode)
diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..d58f2078b6f8996a31c2f830ef5adf79bc7948c3
--- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py @@ -0,0 +1,211 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from ....core.common.const import CompareConst, Const +from ..utils import ToolTip, GraphConst, str2float + + +class ModeAdapter: + def __init__(self, compare_mode): + self.compare_mode = compare_mode + self.csv_data = [] + self.compare_nodes = [] + + @staticmethod + def _add_md5_compare_data(node_data, compare_data_dict): + precision_status = True + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + key_list = [GraphConst.JSON_MD5_KEY] + headers = CompareConst.MD5_COMPARE_RESULT_HEADER + id_list = [headers.index(x) for x in key_list] + ModeAdapter._match_data(value, compare_data, key_list, id_list) + # md5比对是否通过 + if value.get(GraphConst.JSON_MD5_KEY) != CompareConst.PASS: + precision_status = False + node_data[key] = value + return precision_status + + @staticmethod + def _add_real_compare_data(node_data, compare_data_dict): + min_thousandth = float(1) + numbers = [] + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + key_list = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, + CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] + headers = CompareConst.COMPARE_RESULT_HEADER + id_list = [headers.index(x) for x in key_list] + ModeAdapter._match_data(value, compare_data, key_list, id_list) + # 获取一个节点所有的输入或输出最小的双千指标 + thousandth = value.get(CompareConst.ONE_THOUSANDTH_ERR_RATIO) + # 可能是None,可能是非数字内容str + try: + thousandth = float(thousandth) + except (ValueError, TypeError): + thousandth = None + if thousandth is not None: + numbers.append(thousandth) + node_data[key] = value + # 双千指标都是None的异常情况 + if not numbers: + min_thousandth = None + else: + min_thousandth = min(numbers + [min_thousandth]) + return min_thousandth + + @staticmethod + def _add_summary_compare_data( node_data, compare_data_dict): + precision_status = True + max_relative_err = 0 + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + # 对应比对结果csv的列 + key_list = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF, + CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, + CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] + headers = CompareConst.SUMMARY_COMPARE_RESULT_HEADER + id_list = [headers.index(x) for x in key_list] + ModeAdapter._match_data(value, compare_data, key_list, id_list) + # 相对误差大于0.5疑似有精度问题,小值域1e-3不比较相对误差 + for index, item in enumerate(key_list[4:]): + value_diff = value.get(key_list[index]) + if isinstance(value_diff, float) and value_diff != 0 and 
abs(value_diff) < GraphConst.SMALL_VALUE: + value[item] = ToolTip.SMALL_VALUE_TIP.format(key_list[index]) + continue + relative_err = str2float(value.get(item)) + max_relative_err = max(max_relative_err, relative_err) + node_data[key] = value + if max_relative_err > GraphConst.MAX_RELATIVE_ERR_TH: + precision_status = False + max_relative_err = 1 if max_relative_err > 1 else max_relative_err + precision_index = 1 - max_relative_err + return precision_status, precision_index + + @staticmethod + def _match_data(data_dict, compare_data, key_list, id_list): + """ + 绑定精度指标到node的input_data和output_data + """ + if len(key_list) != len(id_list): + return + for id, key in zip(id_list, key_list): + data = compare_data[id] + if data is not None and 'nan' not in str(data) and str(data) != ' ': + data_dict[key] = data + else: + data_dict[key] = 'null' + + def parse_result(self, node, compare_data_dict): + """ + 根据结果返回数据,分别是precision_status,precision_index,和附加数据 + """ + other_dict = {} + if self.is_md5_compare(): + precision_status_in = ModeAdapter._add_md5_compare_data(node.input_data, compare_data_dict[0]) + precision_status_out = ModeAdapter._add_md5_compare_data(node.output_data, compare_data_dict[1]) + # 所有输入输出md5对比通过,这个节点才算通过 + precision_status = precision_status_in and precision_status_out + precision_index = 1 if precision_status else 0 + other_result = CompareConst.PASS if precision_status else CompareConst.DIFF + other_dict[GraphConst.JSON_MD5_KEY] = other_result + elif self.is_summary_compare(): + precision_status_in, precision_index_in = ModeAdapter._add_summary_compare_data(node.input_data, compare_data_dict[0]) + precision_status_out, precision_index_out = ModeAdapter._add_summary_compare_data(node.output_data, compare_data_dict[1]) + precision_status = precision_status_in and precision_status_out + precision_index = min(precision_index_in, precision_index_out) + else: + min_thousandth_in = ModeAdapter._add_real_compare_data(node.input_data, compare_data_dict[0]) + min_thousandth_out = ModeAdapter._add_real_compare_data(node.output_data, compare_data_dict[0]) + if min_thousandth_in and min_thousandth_out: + change_percentage = abs(min_thousandth_in - min_thousandth_out) + else: + change_percentage = 0 + precision_status = True + if change_percentage > GraphConst.REAL_DATA_TH: + precision_status = False + precision_index = 0 if change_percentage > 1 else 1 - change_percentage + return precision_status, precision_index, other_dict + + def prepare_real_data(self, node): + """ + 为真实数据比较模式准备节点信息 + """ + if self.is_real_data_compare(): + self.compare_nodes.append(node) + return True + return False + + def is_summary_compare(self): + return self.compare_mode == GraphConst.SUMMARY_COMPARE + + def is_md5_compare(self): + return self.compare_mode == GraphConst.MD5_COMPARE + + def is_real_data_compare(self): + return self.compare_mode == GraphConst.REAL_DATA_COMPARE + + def add_csv_data(self, compare_result_list): + if not self.is_real_data_compare(): + return + self.csv_data.extend(compare_result_list) + + def add_error_key(self, node_data): + """ + 根据不同的模式进行提供不同错误信息 + """ + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + if self.is_summary_compare(): + message = [CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, + CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] + elif self.is_real_data_compare(): + message = [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] + else: + # 输出件优化 + message = [] + 
value[GraphConst.ERROR_KEY] = message + node_data[key] = value + + def get_tool_tip(self): + """ + 用于前端展示字段的具体含义 + """ + if self.is_summary_compare(): + tips = { + CompareConst.MAX_DIFF: ToolTip.MAX_DIFF, + CompareConst.MIN_DIFF: ToolTip.MIN_DIFF, + CompareConst.MEAN_DIFF: ToolTip.MEAN_DIFF, + CompareConst.NORM_DIFF: ToolTip.NORM_DIFF} + elif self.is_md5_compare(): + tips = {Const.MD5: ToolTip.MD5} + else: + tips = { + CompareConst.ONE_THOUSANDTH_ERR_RATIO: ToolTip.ONE_THOUSANDTH_ERR_RATIO, + CompareConst.COSINE: ToolTip.COSINE, + CompareConst.MAX_ABS_ERR: ToolTip.MAX_ABS_ERR, + CompareConst.MAX_RELATIVE_ERR: ToolTip.MAX_RELATIVE_ERR} + return tips diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py new file mode 100644 index 0000000000000000000000000000000000000000..f04f367f591244a6d1ed48529d1fb4aae7cb2453 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py @@ -0,0 +1,107 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
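For reference, here is a standalone sketch (not part of the patch) of the summary-mode scoring implemented by _add_summary_compare_data and parse_result above: a node fails once the largest relative error exceeds the 0.5 threshold, and precision_index is one minus the largest relative error clamped to [0, 1]:

    MAX_RELATIVE_ERR_TH = 0.5  # mirrors GraphConst.MAX_RELATIVE_ERR_TH

    def summary_precision(relative_errors):
        # relative_errors: relative error values already parsed to float
        max_err = max(relative_errors, default=0.0)
        precision_status = max_err <= MAX_RELATIVE_ERR_TH
        precision_index = 1.0 - min(max_err, 1.0)
        return precision_status, precision_index

    print(summary_precision([0.01, 0.2]))  # (True, 0.8)
    print(summary_precision([0.7, 1.5]))   # (False, 0.0)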
+
+from .node_op import NodeOp
+from ..utils import Suggestions, GraphConst
+from ..builder.msprobe_adapter import format_node_data, compare_data
+
+
+class BaseNode:
+    def __init__(self, node_op, node_id, up_node=None):
+        self.op = node_op
+        self.id = node_id
+        self.data = {}
+        self.output_data = {}
+        self.input_data = {}
+        self.upnode = None
+        self.add_upnode(up_node)
+        self.subnodes = []
+        self.matched_node_link = []
+        self.suggestions = {}
+
+    def __str__(self):
+        info = f'id:\t{self.id}'
+        return info
+
+    def __eq__(self, other):
+        """
+        Decide whether two nodes can be matched, i.e. whether their structures are consistent
+        """
+        if not compare_data(self.input_data, other.input_data):
+            return False
+        if not compare_data(self.output_data, other.output_data):
+            return False
+        return True
+
+    def get_suggestions(self):
+        """
+        Provide suggestions when the accuracy looks suspicious
+        """
+        if self.op == NodeOp.module:
+            self.suggestions[GraphConst.SUGGEST_KEY] = Suggestions.Module
+            self.suggestions[Suggestions.PTDBG] = Suggestions.PTDBG_URL
+        elif self.op == NodeOp.function_api:
+            self.suggestions[GraphConst.SUGGEST_KEY] = Suggestions.API
+            self.suggestions[Suggestions.API_ACCURACY_CHECKER] = Suggestions.API_ACCURACY_CHECKER_URL
+
+    def set_input_output(self, input_data, output_data):
+        self.input_data = input_data
+        self.output_data = output_data
+
+    def add_upnode(self, node):
+        """
+        Bind an upnode to link two nodes as parent and child
+        """
+        if not node or node.id == self.id or self.upnode:
+            return
+        self.upnode = node
+        node.subnodes.append(self)
+
+    def add_link(self, node, ancestors):
+        """
+        Record matching data after two nodes are matched
+        Args:
+            node: the node matched with self
+            ancestors: ancestor info of the matched node
+        """
+        self.matched_node_link = ancestors
+        node.matched_node_link = ancestors
+
+    def to_dict(self):
+        """
+        Export the node data
+        """
+        result = {}
+        result['id'] = self.id
+        result['node_type'] = self.op.value
+        result['data'] = self.data
+        result['output_data'] = format_node_data(self.output_data)
+        result['input_data'] = format_node_data(self.input_data)
+        result['upnode'] = self.upnode.id if self.upnode else 'None'
+        result['subnodes'] = [node.id for node in self.subnodes]
+        result['matched_node_link'] = self.matched_node_link
+        result['suggestions'] = self.suggestions
+        return result
+
+    def get_ancestors(self):
+        """
+        Get the list of all ancestors of this node
+        """
+        ancestors = []
+        current_node = self.upnode
+        while current_node:
+            ancestors.append(current_node.id)
+            current_node = current_node.upnode
+        return list(reversed(ancestors))
diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bae10ad3fc8a041d3ef2e8fb707d40a22b42f19
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
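A small usage sketch (not part of the patch) of the parent/child wiring provided by BaseNode above; the node names are made up, and the import path matches the unit tests added later in this patch:

    from msprobe.pytorch.visualization.graph.base_node import BaseNode, NodeOp

    root = BaseNode(NodeOp.module, "Module.root.0")
    layer = BaseNode(NodeOp.module, "Module.layer1.0", up_node=root)
    api = BaseNode(NodeOp.function_api, "Torch.matmul.0", up_node=layer)

    print(api.get_ancestors())            # ['Module.root.0', 'Module.layer1.0']
    print([n.id for n in root.subnodes])  # ['Module.layer1.0']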
+
+from .base_node import BaseNode
+from .node_op import NodeOp
+from ..utils import GraphConst
+
+
+class Graph:
+    def __init__(self, model_name):
+        self.node_map = {}
+        self.add_node(NodeOp.module, model_name)
+        self.root = self.get_node(model_name)
+
+    def __str__(self):
+        infos = [f'{str(self.node_map.get(node_id))}' for node_id in self.node_map]
+        info = "\n".join(infos)
+        return info
+
+    @staticmethod
+    def match(graph_n, node_n, graph_b):
+        """
+        Given node n, find its counterpart in the other graph; the precondition is that its parent has already been matched
+        Currently an exact match is used; some fuzzy-matching logic may be added here later
+        Returns the matched node and the ancestor chain, or None, [] if there is no match
+        """
+        if not node_n or node_n.id not in graph_b.node_map:
+            return None, []
+        node_b = graph_b.node_map.get(node_n.id)
+        if node_n != node_b:
+            return None, []
+        ancestors_n = node_n.get_ancestors()
+        ancestors_b = node_b.get_ancestors()
+        if ancestors_n != ancestors_b:
+            return None, []
+        return node_b, ancestors_n
+
+    @staticmethod
+    def dfs(node, result):
+        info = node.to_dict()
+        result[node.id] = info
+        for subnode in node.subnodes:
+            Graph.dfs(subnode, result)
+
+    def add_node(self, node_op, node_id, up_node=None):
+        """
+        Add a node to the graph
+        Args:
+            node_op: type of the node to add
+            node_id: id of the node to add
+            up_node: parent of the node
+        """
+        if node_id in self.node_map:
+            return
+        node = BaseNode(node_op, node_id, up_node)
+        self.node_map[node_id] = node
+
+    def get_node(self, node_id):
+        """
+        Return the node, or None if it does not exist
+        """
+        return self.node_map.get(node_id, None)
+
+    def to_dict(self):
+        """
+        Export the graph data
+        """
+        result = {}
+        result[GraphConst.JSON_ROOT_KEY] = self.root.id if self.root else 'None'
+        result[GraphConst.JSON_NODE_KEY] = {}
+        for node_id in self.node_map:
+            info = self.node_map.get(node_id).to_dict()
+            result[GraphConst.JSON_NODE_KEY][node_id] = info
+        return result
diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1629caabd1989beac72646ea36efb4a82b328f3a
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
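A sketch (not part of the patch) of the exact-match rule implemented by Graph.match above: a node matches only if the bench graph has a node with the same name, their input/output structures compare equal, and their ancestor chains are identical:

    from msprobe.pytorch.visualization.graph.graph import Graph, NodeOp

    graph_n = Graph("Model")
    graph_b = Graph("Model")
    for g in (graph_n, graph_b):
        g.add_node(NodeOp.function_api, "Torch.add.0", g.root)

    node_n = graph_n.get_node("Torch.add.0")
    node_b, ancestors = Graph.match(graph_n, node_n, graph_b)
    print(node_b.id if node_b else None, ancestors)  # Torch.add.0 ['Model']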
+ +from enum import Enum +import re +from ..builder.msprobe_adapter import op_patterns + + +class NodeOp(Enum): + module = 0 + function_api = 1 + + @staticmethod + def get_node_op(node_name: str): + """ + 基于代表节点的字符串,解析节点种类 + """ + for op in NodeOp: + index = op.value + if index < 0 or index >= len(op_patterns): + raise Exception("NodeOp and op_patterns in MsprobeAdapter do not match") + pattern = op_patterns[index] + if re.match(pattern, node_name): + return op + raise Exception(f"Cannot parse node_name {node_name} into NodeOp") diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/test.py b/debug/accuracy_tools/msprobe/pytorch/visualization/test.py new file mode 100644 index 0000000000000000000000000000000000000000..165d54ce17ed295308c7fa52b4dc5251271453a8 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/test.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import shutil +import filecmp +from .compare.graph_comparator import GraphComparator +from .utils import GraphConst +from .builder.graph_builder import GraphBuilder +from ...pytorch.common.log import logger +from ...core.common.file_check import create_directory + + +def compare_graph(dump_path_n, dump_path_b, out_path): + # 对两个数据进行构图 + construct_path_n = os.path.join(dump_path_n, GraphConst.CONSTRUCT_FILE) + construct_path_b = os.path.join(dump_path_b, GraphConst.CONSTRUCT_FILE) + data_path_n = os.path.join(dump_path_n, GraphConst.DUMP_FILE) + data_path_b = os.path.join(dump_path_b, GraphConst.DUMP_FILE) + graph_n = GraphBuilder.build(construct_path_n, data_path_n, 'TestNet') + graph_b = GraphBuilder.build(construct_path_b, data_path_b, 'TestNet') + # 基于graph、stack和data进行比较 + stack_path = os.path.join(dump_path_n, GraphConst.STACK_FILE) + graph_comparator = GraphComparator([graph_n, graph_b], [data_path_n, data_path_b], stack_path, out_path) + graph_comparator.compare() + output_path = os.path.join(out_path, 'compare.vis') + GraphBuilder.to_json(output_path, graph_n, graph_b, graph_comparator.ma.get_tool_tip()) + + +def build_graph(dump_path, out_path): + construct_path = os.path.join(dump_path, GraphConst.CONSTRUCT_FILE) + data_path = os.path.join(dump_path, GraphConst.DUMP_FILE) + output_path = os.path.join(out_path, 'build.vis') + graph = GraphBuilder.build(construct_path, data_path, 'TestNet') + GraphBuilder.to_json(output_path, graph) + + +def run_st(data_path): + start_time = time.time() + run_bench(data_path, 'output2') + end_time = time.time() + logger.info(f'run_st time cost: {end_time - start_time}') + # 比较output2的结果和output1 的bench结果差距 + for data_dir in os.listdir(data_path): + data_dir = os.path.join(data_path, data_dir) + if not os.path.isdir(data_dir): + continue + output1 = os.path.join(data_dir, 'output1') + output2 = os.path.join(data_dir, 'output2') + files = ['build.vis', 'compare.vis'] + for vis_file in files: + file1 = os.path.join(output1, vis_file) + file2 = 
os.path.join(output2, vis_file) + result = filecmp.cmp(file1, file2) + if result: + logger.info('pass ' + file1) + else: + logger.info('not pass ' + file1) + + +def run_bench(data_path, output_dir): + for data_dir in os.listdir(data_path): + data_dir = os.path.join(data_path, data_dir) + if not os.path.isdir(data_dir): + continue + run_data_path = os.path.join(data_dir, 'data') + output_path = os.path.join(data_dir, output_dir) + if os.path.exists(output_path): + shutil.rmtree(output_path) + create_directory(output_path) + build_graph(run_data_path, output_path) + compare_graph(run_data_path, run_data_path, output_path) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fb046f9758686fe810a05b1a23d76880b86bb994 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from ...core.common.file_check import FileOpen +from ..compare.acc_compare import result_to_csv + + +def load_json_file(file_path): + """ + 加载json文件 + """ + try: + with FileOpen(file_path, 'r') as f: + file_dict = json.load(f) + if not isinstance(file_dict, dict): + return {} + return file_dict + except json.JSONDecodeError: + return {} + + +def load_data_json_file(file_path): + """ + 加载dump.json中的data字段 + """ + return load_json_file(file_path).get(GraphConst.DATA_KEY, {}) + + +def save_json_file(file_path, data): + """ + 保存json文件 + """ + with FileOpen(file_path, 'w') as f: + f.write(json.dumps(data, indent=4)) + + +def get_csv_df(md5_compare, summary_compare, stack, csv_data): + """ + 调用acc接口写入csv + """ + return result_to_csv(md5_compare, summary_compare, stack, csv_data, None) + + +def str2float(percentage_str): + """ + 百分比字符串转换转换为浮点型 + Args: + percentage_str: '0.00%', '23.4%' + Returns: float 0.00, 0.234 + """ + try: + percentage_str = percentage_str.strip('%') + return float(percentage_str) / 100 + except ValueError: + return 0 + + +class ToolTip: + MAX_DIFF = 'NPU与标杆API统计信息比对,最大值的差值' + MIN_DIFF = 'NPU与标杆API统计信息比对,最小值的差值' + MEAN_DIFF = 'NPU与标杆API统计信息比对,平均值的差值' + NORM_DIFF = 'NPU与标杆API统计信息比对,2范数(平方根)的差值' + MD5 = '数据MD5信息,用于比较两个数据信息是否完全一致' + ONE_THOUSANDTH_ERR_RATIO = 'Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一的比例占总元素个数的比例小于千分之一' + COSINE = '通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0' + MAX_ABS_ERR = '当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001' + MAX_RELATIVE_ERR = '当最大相对误差越接近0表示其计算的误差越小。当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象' + SMALL_VALUE_TIP = '{} 小于1e-3,不计算相对误差' + + +class Suggestions: + Module = '此模块精度比对结果疑似异常,请使用ptdbg工具对模块中的api进行dump比对' + API = '此api精度比对结果疑似异常,请使用api accuracy checker工具对api进行精度检测' + PTDBG = 'ptdbg工具' + PTDBG_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend' + API_ACCURACY_CHECKER = 'api accuracy checker工具' + 
API_ACCURACY_CHECKER_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker' + + +class GraphConst: + CONSTRUCT_FILE = 'construct.json' + DUMP_FILE = 'dump.json' + STACK_FILE = 'stack.json' + GRAPH_FILE = 'graph.vis' + ERROR_KEY = 'error_key' + SUMMARY_COMPARE = 0 + MD5_COMPARE = 1 + REAL_DATA_COMPARE = 2 + JSON_NPU_KEY = 'NPU' + JSON_BENCH_KEY = 'Bench' + JSON_TIP_KEY = 'Tooltip' + JSON_MD5_KEY = 'md5 Compare Result' + JSON_ROOT_KEY = 'root' + JSON_NODE_KEY = 'node' + DATA_KEY = 'data' + REAL_DATA_TH = 0.1 + MAX_RELATIVE_ERR_TH = 0.5 + ROUND_TH = 6 + JSON_STATUS_KEY = 'precision_status' + JSON_INDEX_KEY = 'precision_index' + SUGGEST_KEY = 'text' + TAG_NA = 'na' + OUTPUT_INDEX = -2 + STR_MAX_LEN = 50 + SMALL_VALUE = 1e-3 diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_graph_builder.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_graph_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..66eceea4b2a1ccf48ac95491c1a2cdca718a403a --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_graph_builder.py @@ -0,0 +1,52 @@ +import unittest +from unittest.mock import MagicMock, patch +from msprobe.pytorch.visualization.builder.graph_builder import GraphBuilder, Graph + + +class TestGraphBuilder(unittest.TestCase): + + def setUp(self): + self.construct_path = "step/rank/construct.json" + self.data_path = "step/rank/dump.json" + self.model_name = "TestModel" + self.graph = Graph(self.model_name) + self.construct_dict = { + "Tensor1": "Module1", + "Module1": None + } + self.data_dict = { + "Module1": {"data": "data for Module1"}, + "Tensor1": {"data": "data for Tensor1"} + } + + @patch('msprobe.pytorch.visualization.builder.graph_builder.load_json_file') + @patch('msprobe.pytorch.visualization.builder.graph_builder.load_data_json_file') + def test_build(self, mock_load_data_json_file, mock_load_json_file): + mock_load_data_json_file.return_value = self.data_dict + mock_load_json_file.return_value = self.construct_dict + + graph = GraphBuilder.build(self.construct_path, self.data_path, self.model_name) + self.assertIsNotNone(graph) + self.assertIsInstance(graph, Graph) + self.assertEqual(len(graph.node_map), 3) + + @patch('msprobe.pytorch.visualization.builder.graph_builder.save_json_file') + def test_to_json(self, mock_save_json_file): + GraphBuilder.to_json("step/rank/output.vis", self.graph) + mock_save_json_file.assert_called_once() + + @patch('msprobe.pytorch.visualization.graph.node_op.NodeOp.get_node_op') + @patch('msprobe.pytorch.visualization.builder.msprobe_adapter.get_input_output', return_value=([], [])) + def test__init_nodes(self, mock_get_input_output, mock_get_node_op): + GraphBuilder._init_nodes(self.graph, self.construct_dict, self.data_dict) + mock_get_node_op.assert_any_call("Tensor1") + mock_get_node_op.assert_any_call("Module1") + self.assertIs(self.graph.root, self.graph.get_node("TestModel")) + + def test__create_or_get_node(self): + node_op = MagicMock() + data_dict = {"node1": {}} + node = GraphBuilder._create_or_get_node(self.graph, data_dict, node_op, "node1") + self.assertIn("node1", self.graph.node_map) + self.assertEqual(node.input_data, {}) + self.assertEqual(node.output_data, {}) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_msprobe_adapter.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_msprobe_adapter.py new file mode 100644 index 
0000000000000000000000000000000000000000..12ae24279fd2af433d34f0cd3929eb075e209a49 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_msprobe_adapter.py @@ -0,0 +1,73 @@ +import unittest +from unittest.mock import patch +from msprobe.pytorch.visualization.builder.msprobe_adapter import ( + get_compare_mode, + run_real_data, + get_input_output, + compare_data, + format_node_data, + compare_node, + _format_decimal_string, + _format_data +) +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestMsprobeAdapter(unittest.TestCase): + @patch('msprobe.pytorch.visualization.builder.msprobe_adapter.task_dumppath_get', return_value=(True, False)) + def test_get_compare_mode_summary(self, mock_task_dumppath_get): + mode = get_compare_mode("dummy_param") + self.assertEqual(mode, GraphConst.SUMMARY_COMPARE) + + @patch('msprobe.pytorch.visualization.builder.msprobe_adapter._do_multi_process') + def test_run_real_data(self, mock_do_multi_process): + run_real_data("dump_path", "csv_path") + mock_do_multi_process.assert_called_once_with("dump_path", "csv_path") + + def test_get_input_output(self): + node_data = { + 'input_args': [{'type': 'torch.Tensor', 'dtype': 'torch.int64', 'shape': [5], + 'Max': 2049.0, 'Min': 0.0, 'Mean': 410.20001220703125, 'Norm': 2049.0009765625, + 'requires_grad': False, 'full_op_name': 'Distributed.broadcast.0.forward_input.0'}, + {'type': 'int', 'value': 0}], + 'input_kwargs': {'group': None}, + 'output': [{'type': 'torch.Tensor', 'dtype': 'torch.int64', 'shape': [5], + 'Max': 2049.0, 'Min': 0.0, 'Mean': 410.20001220703125, 'Norm': 2049.0009765625, + 'requires_grad': False, 'full_op_name': 'Distributed.broadcast.0.forward_output.0'}, + {'type': 'int', 'value': 0}, None] + } + node_id = "Distributed.broadcast.0.forward" + input_data, output_data = get_input_output(node_data, node_id) + self.assertIn("Distributed.broadcast.0.forward_output.0", output_data) + self.assertIn("Distributed.broadcast.0.forward_input.0", input_data) + + def test_compare_data(self): + data_dict_list1 = {'key1': {'type': 'Type1', 'dtype': 'DType1', 'shape': 'Shape1'}} + data_dict_list2 = {'key1': {'type': 'Type1', 'dtype': 'DType1', 'shape': 'Shape1'}} + self.assertTrue(compare_data(data_dict_list1, data_dict_list2)) + + def test_format_node_data(self): + data_dict = {'node1': {'data_name': 'data1', 'full_op_name': 'op1'}} + result = format_node_data(data_dict) + self.assertNotIn('data_name', result['node1']) + self.assertNotIn('requires_grad', result['node1']) + + @patch('msprobe.pytorch.visualization.builder.msprobe_adapter.get_accuracy') + def test_compare_node(self, mock_get_accuracy): + node_ids = ["node1", "node2"] + data_dicts = [{'node1': {"input_args": [], "input_kwargs": {}, "output": {}}}, + {'node2': {"input_args": [], "input_kwargs": {}, "output": {}}}] + stack_json_data = {} + result = compare_node(node_ids, data_dicts, stack_json_data, False, False) + mock_get_accuracy.assert_called_once() + self.assertIsInstance(result, list) + + def test__format_decimal_string(self): + s = "0.123456789%" + formatted_s = _format_decimal_string(s) + self.assertIn("0.123457%", formatted_s) + + def test__format_data(self): + data_dict = {'value': 0.123456789} + _format_data(data_dict) + self.assertEqual(data_dict['value'], '0.123457') \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_graph_comparator.py 
b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_graph_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..bece5380f04836a232a8c154a606c1cb68759b1c --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_graph_comparator.py @@ -0,0 +1,32 @@ +import unittest +from unittest.mock import patch +from msprobe.pytorch.visualization.compare.graph_comparator import GraphComparator +from msprobe.pytorch.visualization.graph.graph import Graph +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestGraphComparator(unittest.TestCase): + + def setUp(self): + self.graphs = [Graph("model1"), Graph("model2")] + self.data_paths = ["step1/rank/dump.json", "step2/rank/dump.json"] + self.stack_path = "step1/rank/stack.json" + self.output_path = "output/output.vis" + + @patch('msprobe.pytorch.visualization.compare.graph_comparator.get_compare_mode') + @patch('msprobe.pytorch.visualization.compare.graph_comparator.load_json_file') + @patch('msprobe.pytorch.visualization.compare.graph_comparator.load_data_json_file') + def test__parse_param(self, mock_load_data_json_file, mock_load_json_file, mock_get_compare_mode): + mock_load_data_json_file.return_value = "data_dict" + mock_load_json_file.return_value = "construct_dict" + mock_get_compare_mode.return_value = GraphConst.SUMMARY_COMPARE + self.comparator = GraphComparator(self.graphs, self.data_paths, self.stack_path, self.output_path) + self.comparator._parse_param(self.data_paths, self.stack_path, self.output_path) + + self.assertEqual(self.comparator.dump_path_param, { + 'npu_json_path': self.data_paths[0], + 'bench_json_path': self.data_paths[1], + 'stack_json_path': self.stack_path, + 'is_print_compare_log': True + }) + self.assertEqual(self.comparator.output_path, self.output_path) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_mode_adapter.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_mode_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..7883a09a34115132ac2b8b217de434e32e58c279 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_mode_adapter.py @@ -0,0 +1,61 @@ +import unittest +from unittest.mock import patch, MagicMock +from msprobe.pytorch.visualization.compare.mode_adapter import ModeAdapter +from msprobe.pytorch.visualization.graph.base_node import BaseNode, NodeOp +from msprobe.pytorch.visualization.utils import GraphConst, ToolTip +from msprobe.core.common.const import CompareConst + + +class TestModeAdapter(unittest.TestCase): + + def setUp(self): + self.node_op = NodeOp.module + self.node_id = "node_1" + self.node = BaseNode(self.node_op, self.node_id) + self.compare_mode = GraphConst.REAL_DATA_COMPARE + self.adapter = ModeAdapter(self.compare_mode) + self.compare_data_dict = [{}, {}] + + def test_add_md5_compare_data(self): + node_data = {'md5_key': 'some_md5_value'} + compare_data_dict = {'md5_key': 'expected_md5_value'} + precision_status = ModeAdapter._add_md5_compare_data(node_data, compare_data_dict) + self.assertTrue(precision_status) + + @patch('msprobe.pytorch.visualization.compare.mode_adapter.ModeAdapter') + def test_parse_result(self, mock_mode_adapter): + mock_mode_adapter._add_summary_compare_data.return_value = (True, 0.5) + self.adapter.compare_mode = GraphConst.SUMMARY_COMPARE + precision_status, precision_index, other_dict = self.adapter.parse_result( + self.node, 
self.compare_data_dict) + self.assertEqual(precision_status, True) + self.assertEqual(precision_index, 0.5) + self.assertEqual(other_dict, {}) + + def test_prepare_real_data(self): + self.adapter.is_real_data_compare = MagicMock(return_value=True) + result = self.adapter.prepare_real_data(self.node) + self.assertTrue(result) + + def test_compare_mode_methods(self): + self.adapter.compare_mode = GraphConst.SUMMARY_COMPARE + self.assertTrue(self.adapter.is_summary_compare()) + self.assertFalse(self.adapter.is_md5_compare()) + self.assertFalse(self.adapter.is_real_data_compare()) + + def test_add_csv_data(self): + compare_result_list = ['result1', 'result2'] + self.adapter.add_csv_data(compare_result_list) + self.assertEqual(self.adapter.csv_data, compare_result_list) + + def test_add_error_key(self): + node_data = {'key': {}} + self.adapter.compare_mode = GraphConst.REAL_DATA_COMPARE + self.adapter.add_error_key(node_data) + self.assertEqual(node_data['key'][GraphConst.ERROR_KEY], + [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]) + + def test_get_tool_tip(self): + self.adapter.compare_mode = GraphConst.MD5_COMPARE + tips = self.adapter.get_tool_tip() + self.assertEqual(tips, {'md5': ToolTip.MD5}) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_base_node.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_base_node.py new file mode 100644 index 0000000000000000000000000000000000000000..544950f35881e19eb449a138a4b0937ca91eb1d7 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_base_node.py @@ -0,0 +1,64 @@ +import unittest +from msprobe.pytorch.visualization.graph.base_node import BaseNode, NodeOp +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestBaseNode(unittest.TestCase): + + def setUp(self): + self.node_op = NodeOp.module + self.node_id = "node_1" + self.up_node = BaseNode(self.node_op, "up_node_1") + self.node = BaseNode(self.node_op, self.node_id, self.up_node) + + def test_init_and_str(self): + self.assertEqual(self.node.op, self.node_op) + self.assertEqual(self.node.id, self.node_id) + self.assertEqual(str(self.node), 'id:\tnode_1') + + def test_eq(self): + other_node = BaseNode(self.node_op, self.node_id, self.up_node) + self.assertEqual(self.node, other_node) + + def test_get_suggestions(self): + self.node.get_suggestions() + self.assertIn(GraphConst.SUGGEST_KEY, self.node.suggestions) + + def test_set_input_output(self): + input_data = {'input1': 'value1'} + output_data = {'output1': 'value2'} + self.node.set_input_output(input_data, output_data) + self.assertEqual(self.node.input_data, input_data) + self.assertEqual(self.node.output_data, output_data) + + def test_add_upnode(self): + self.node = BaseNode(self.node_op, self.node_id) + new_up_node = BaseNode(self.node_op, "new_up_node_1") + self.node.add_upnode(new_up_node) + self.assertEqual(self.node.upnode, new_up_node) + self.assertIn(self.node, new_up_node.subnodes) + + def test_add_link(self): + other_node = BaseNode(self.node_op, "other_node_1") + ancestors = ['a1', 'a2'] + self.node.add_link(other_node, ancestors) + self.assertEqual(self.node.matched_node_link, ancestors) + self.assertEqual(other_node.matched_node_link, ancestors) + + def test_to_dict(self): + expected_result = { + 'id': self.node_id, + 'node_type': self.node_op.value, + 'data': {}, + 'output_data': {}, + 'input_data': {}, + 'upnode': self.up_node.id, + 'subnodes': [], + 'matched_node_link': [], + 
'suggestions': {} + } + self.assertEqual(self.node.to_dict(), expected_result) + + def test_get_ancestors(self): + expected_ancestors = ['up_node_1'] + self.assertEqual(self.node.get_ancestors(), expected_ancestors) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_graph.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..19d098743458a61d13146b6da1b65098f90171b7 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_graph.py @@ -0,0 +1,50 @@ +import unittest +from msprobe.pytorch.visualization.graph.graph import Graph, NodeOp +from msprobe.pytorch.visualization.graph.base_node import BaseNode +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestGraph(unittest.TestCase): + + def setUp(self): + self.graph = Graph("model_name") + self.node_id = "node_id" + self.node_op = NodeOp.module + + def test_add_node_and_get_node(self): + self.graph.add_node(self.node_op, self.node_id) + node = self.graph.get_node(self.node_id) + self.assertIsNotNone(node) + self.assertIn(self.node_id, self.graph.node_map) + + def test_to_dict(self): + self.graph.add_node(self.node_op, self.node_id) + result = self.graph.to_dict() + self.assertEqual(result[GraphConst.JSON_ROOT_KEY], "model_name") + self.assertIn(self.node_id, result[GraphConst.JSON_NODE_KEY]) + + def test_str(self): + self.graph.add_node(self.node_op, self.node_id) + expected_str = f'{self.node_id}' + self.assertIn(expected_str, str(self.graph)) + + def test_match(self): + graph_a = Graph("model_name_a") + graph_b = Graph("model_name_b") + node_a = BaseNode(self.node_op, self.node_id) + graph_a.add_node(NodeOp.module, "node_id_a") + graph_b.add_node(NodeOp.module, "node_id_b") + matched_node, ancestors = Graph.match(graph_a, node_a, graph_b) + self.assertIsNone(matched_node) + self.assertEqual(ancestors, []) + + def test_dfs(self): + graph = Graph("model_name") + graph.add_node(NodeOp.module, "node_a") + graph.add_node(NodeOp.module, "node_b") + node_a = BaseNode(self.node_op, self.node_id) + result = {} + graph.dfs(node_a, result) + self.assertEqual(result, {'node_id': {'id': 'node_id', 'node_type': 0, 'data': {}, + 'output_data': {}, 'input_data': {}, 'upnode': 'None', 'subnodes': [], + 'matched_node_link': [], 'suggestions': {}}}) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_node_op.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_node_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1a340ac8b3c7144a9e07485c93e289a950eee8c7 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_node_op.py @@ -0,0 +1,28 @@ +import unittest +from msprobe.pytorch.visualization.graph.node_op import NodeOp + + +class TestNodeOp(unittest.TestCase): + + def test_get_node_op_valid(self): + node_name = "ModuleTest" + self.assertEqual(NodeOp.get_node_op(node_name), NodeOp.module) + + def test_get_node_op_invalid(self): + node_name = "InvalidNodeName" + with self.assertRaises(Exception): + NodeOp.get_node_op(node_name) + + def test_get_node_op_all(self): + test_cases = [ + ("ModuleTest", NodeOp.module), + ("TensorTest", NodeOp.function_api), + ("TorchTest", NodeOp.function_api), + ("FunctionalTest", NodeOp.function_api), + ("NPUTest", NodeOp.function_api), + ("VFTest", NodeOp.function_api), + ("DistributedTest", NodeOp.function_api), + ("AtenTest", 
NodeOp.function_api) + ] + for node_name, expected_op in test_cases: + self.assertEqual(NodeOp.get_node_op(node_name), expected_op) diff --git a/debug/accuracy_tools/ptdbg_ascend/__init__.py b/debug/accuracy_tools/ptdbg_ascend/__init__.py index 92fb62c7566d9b3da72e9dfb87486dc361a3919e..0b8a5f1dd3494f2353d140e02fba58455a2e15c2 100644 --- a/debug/accuracy_tools/ptdbg_ascend/__init__.py +++ b/debug/accuracy_tools/ptdbg_ascend/__init__.py @@ -14,4 +14,3 @@ # limitations under the License. import torch -from ptdbg_ascend.src.python.ptdbg_ascend import * diff --git a/debug/accuracy_tools/setup.py b/debug/accuracy_tools/setup.py index 4e0eaa1f3754f150006a8d656dc26d81b6ccea1a..9d4356171d9207eb803771e638461e43dc560559 100644 --- a/debug/accuracy_tools/setup.py +++ b/debug/accuracy_tools/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRED = [ EXCLUDE_PKGS = [ "api_accuracy_checker*", "grad_tool*", - "kj600*", + "monitor*", "ptdbg_ascend*", "msprobe.test*", ] diff --git a/profiler/README.md b/profiler/README.md index 1669e3524e54bb78e6f4f09f597d2399196ff950..549ffefc14cebbeb6730ab23f4df980bc33dfbfa 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -91,6 +91,7 @@ ascend pytorch profiler数据目录结构如下: | profiler版本 | 发布日期 | 下载链接 | 校验码 | | ------------ | ---------- | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 1.2.0 | 2024-07-25 | [msprof_analyze-1.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.2.0/msprof_analyze-1.2.0-py3-none-any.whl) | 6a4366e3beca40b4a8305080e6e441d6ecafb5c05489e5905ac0265787555f37 | | 1.1.2 | 2024-07-12 | [msprof_analyze-1.1.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.2/msprof_analyze-1.1.2-py3-none-any.whl) | af62125b1f9348bf491364e03af712fc6d0282ccee3fb07458bc9bbef82dacc6 | | 1.1.1 | 2024-06-20 | [msprof_analyze-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.1/msprof_analyze-1.1.1-py3-none-any.whl) | 76aad967a3823151421153d368d4d2f8e5cfbcb356033575e0b8ec5acea8e5e4 | | 1.1.0 | 2024-05-28 | [msprof_analyze-1.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.0/msprof_analyze-1.1.0-py3-none-any.whl) | b339f70e7d1e45e81f289332ca64990a744d0e7ce6fdd84a8d82e814fa400698 | diff --git a/profiler/advisor/README.md b/profiler/advisor/README.md index c650f40b3ea8ef48b3c7644e279b00a1cb99f29a..77027110559de578d9339c3f5a3d6c762e72a6b5 100644 --- a/profiler/advisor/README.md +++ b/profiler/advisor/README.md @@ -36,11 +36,11 @@ msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的 3. 
查看结果。 - 分析结果输出相关简略建议到执行终端中,并生成`att_advisor_{timestamp}.html`和`att_advisor_{timestamp}.xlsx`文件供用户预览。 + 分析结果输出相关简略建议到执行终端中,并生成`mstt_advisor_{timestamp}.html`和`mstt_advisor_{timestamp}.xlsx`文件供用户预览。 - `att_advisor_{timestamp}.xlsx`文件内容与执行终端输出一致。 + `mstt_advisor_{timestamp}.xlsx`文件内容与执行终端输出一致。 - `att_advisor_{timestamp}.html`文件分析详见“**报告解析**”。 + `mstt_advisor_{timestamp}.html`文件分析详见“**报告解析**”。 执行终端输出示例如下: @@ -72,6 +72,7 @@ msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的 | | block_dim_analysis | block dim算子调优 | | | operator_no_bound_analysis | operator no bound | | | graph | 融合算子图调优 | +| | freq_analysis | AI Core算子降频分析 | | scheduling | timeline_fusion_ops | 亲和API替换调优 | | | timeline_op_dispatch | 识别算子下发问题(路径3/路径5) | @@ -132,6 +133,8 @@ cluster模块的分析包含快慢卡和快慢链路分析,仅识别问题, overall模块的分析包含当前训练任务慢卡的性能拆解,按照计算、通信和下发三个维度进行耗时的统计,可以基于该分析识别到训练性能瓶颈是计算、通信还是下发问题,同样不提供调优建议。 +![输入图片说明](./img/overall_0.png) + ![输入图片说明](./img/overall.png) schedule模块包含亲和API、aclOpCompile、syncBatchNorm、SynchronizeStream等多项检测。 @@ -152,7 +155,7 @@ torch_npu.npu.config.allow_internal_format = False ![schedule_3](./img/schedule_3.png) -computation模块从device计算性能维度进行分析,能够识别AI CPU、计算bound、动态Shape等问题并给出相应建议。此处不再详细展开,按照报告进行调优即可。 +computation模块从device计算性能维度进行分析,能够识别AI CPU、计算bound、动态Shape、AI Core算子降频分析等问题并给出相应建议。此处不再详细展开,按照报告进行调优即可。 ![computation_1](./img/computation_1.png) diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py index e0e17320b3309ed24cfc7f45d6b09f73501be7da..ada1b0bf4f4c8344c8830fe446c8d05dd583eac5 100644 --- a/profiler/advisor/analyzer/base_analyzer.py +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -73,14 +73,6 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): def optimize(self, **kwargs): pass - @abstractmethod - def make_record(self): - pass - - @abstractmethod - def make_render(self): - pass - def init_dataset_list(self)->None: dataset_cls_list = self.dataset_cls_list if len(dataset_cls_list) == 0: diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/__init__.py b/profiler/advisor/analyzer/computation/ai_core_freq/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..4f25deff7c0cdb415ccae6ab748304d4044c5eec --- /dev/null +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py @@ -0,0 +1,36 @@ +import logging + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.computation.ai_core_freq.ai_core_freq_checker import AICoreFreqChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.ai_core_freq.ai_core_freq_dataset import AICoreFreqDataset +from profiler.advisor.config.config import Config + +logger = logging.getLogger() + + +class AICoreFreqAnalyzer(BaseAnalyzer): + dataset_cls_list = [AICoreFreqDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: + super().__init__(collection_path, n_processes, **kwargs) + key = AICoreFreqDataset.get_key() + self.dataset = self.get_first_data_by_key(self.dataset_list, key) + self.result = OptimizeResult() + self.html_render = HTMLRender() + self.html = None + + 
@BaseAnalyzer.check_data((AICoreFreqDataset.get_key(),)) + def optimize(self, **kwargs): + if not Config().get_config("aic_frequency"): + logger.warning("Can not find ai core frequency in info.json*, please check data integrity.") + return self.result + add_render_list = kwargs.get("add_render_list", True) + ai_core_freq_checker = AICoreFreqChecker() + ai_core_freq_checker.check_ai_core_freq(self.dataset) + if not ai_core_freq_checker.ai_core_freq_issues: + return self.result + ai_core_freq_checker.make_record(self.result) + self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list) + return self.result diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..5ea4dbd7542750469967b05ab9a738f2d70600e4 --- /dev/null +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py @@ -0,0 +1,100 @@ +import logging + +from profiler.advisor.dataset.ai_core_freq.ai_core_freq_dataset import AICoreFreqDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.config.config import Config +from profiler.advisor.utils.utils import convert_to_float + +logger = logging.getLogger() + + +class AICoreFreqChecker: + DEFAULT_FREQ = 1800 + DECREASE_FREQ_RATIO = 0.05 + SHOW_TOPK_OPS = 10 + TOTAL_DURATION_INDEX = 2 + DECREASE_FREQ_RATIO_INDEX = 3 + + def __init__(self): + + self.ai_core_freq_issues = False + self.desc = "" + self.suggestions = "" + self.decrease_freq_ops = [] + self.headers = [] + self.op_freq = None + self.rank_id = None + self.stage = None + + def check_ai_core_freq(self, event_dataset: AICoreFreqDataset, rank_id=None, stage=None): + """ + :Param event_dataset: dataset of timeline event + """ + if not hasattr(event_dataset, "op_freq") or not getattr(event_dataset, "op_freq"): + logger.debug("Skip slow ai core frequency checker, " + "because no ai core frequency were recorded in trace_view.json") + return + + self.rank_id = rank_id + self.stage = stage + self.op_freq = event_dataset.op_freq + for op_name, op_info in self.op_freq.items(): + freq_list = op_info.get("freq_list", []) + if not freq_list: + continue + + op_count = op_info.get("count", 0) + op_total_duration = round(op_info.get("dur", 0), 2) + max_freq = max(self.DEFAULT_FREQ, convert_to_float(Config().get_config("aic_frequency"))) + + decrease_freq_ratio = sum(max_freq - freq for freq in freq_list) / (max_freq * len(freq_list)) + if decrease_freq_ratio >= self.DECREASE_FREQ_RATIO: + self.ai_core_freq_issues = True + self.decrease_freq_ops.append([op_name, op_count, op_total_duration, + f"{round(decrease_freq_ratio, 4):.2%}", + round(sum(freq_list) / len(freq_list), 2), + max(freq_list), min(freq_list)]) + + if self.decrease_freq_ops: + # 按算子总耗时和降频比率 降序排列 + self.decrease_freq_ops.sort(key= + lambda x: (x[self.TOTAL_DURATION_INDEX], x[self.DECREASE_FREQ_RATIO_INDEX]), + reverse=True) + + self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " + f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") + if self.rank_id: + self.desc = f"For rank {self.rank_id}, " + self.desc.lower() + self.suggestions = "Please check the temperature or max power of your machine." 
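To make the threshold concrete, here is a standalone check (not part of the patch) of the frequency-reduction ratio computed in check_ai_core_freq above: the average gap between the maximum frequency and each sampled frequency, normalized by the maximum; a ratio of at least DECREASE_FREQ_RATIO (0.05) flags the operator:

    DEFAULT_FREQ = 1800.0  # MHz, mirrors AICoreFreqChecker.DEFAULT_FREQ

    def decrease_freq_ratio(freq_list, max_freq=DEFAULT_FREQ):
        return sum(max_freq - f for f in freq_list) / (max_freq * len(freq_list))

    print(round(decrease_freq_ratio([1800, 1800, 1620]), 4))  # 0.0333 -> below threshold
    print(round(decrease_freq_ratio([1500, 1440, 1620]), 4))  # 0.1556 -> flagged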
+ + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + optimization_item = OptimizeItem("AI Core Frequency", self.desc, [self.suggestions]) + result.add(OptimizeRecord(optimization_item)) + + self.headers = ["Operator name", "Count", "Total duration(us)", "AI CORE frequency decreased ratio", + "Average frequency", "Max frequency", "Min frequency"] + if self.rank_id: + self.headers = ["Rank id"] + self.headers + sub_table_name = "AI Core Frequency" if not self.stage else f"Stage-{self.stage}: AI Core Frequency" + result.add_detail(sub_table_name, headers=self.headers) + + for row in self.decrease_freq_ops: + if self.rank_id: + row = [self.rank_id] + row + result.add_detail(sub_table_name, detail=row) + + def make_render(self, html_render, add_render_list=True): + if self.SHOW_TOPK_OPS: + self.desc += f" Only show {self.SHOW_TOPK_OPS} operators here, see latest mstt_advisor.xlsx for details." + return html_render.render_template(key="computation", + template_dir="templates", + template_name="ai_core_frequency.html", + desc=self.desc, + suggestion=self.suggestions, + headers=self.headers, + data=self.decrease_freq_ops[:self.SHOW_TOPK_OPS], + add_render_list=add_render_list) diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py index 4eca1c6c0278349cf4068544d2a53d8de7f0d5e1..0caede4b894e0dda15333e6d3a480fa943c66323 100644 --- a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -3,13 +3,13 @@ import os from functools import partial from typing import List, Dict, Optional -import yaml from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker, logger from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker import OpStackFinder from profiler.advisor.common import constant from profiler.advisor.dataset.dataset import Dataset from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.cluster_analyse.common_func.file_manager import FileManager class AicpuChecker(OperatorChecker): @@ -47,8 +47,8 @@ class AicpuChecker(OperatorChecker): if not os.path.exists(rule_path): logger.warning("Skip analyze aicpu issues, because %s does not exist.", rule_path) return {} - with open(rule_path, 'r') as f: - self.aicpu_rules = yaml.safe_load(f) + + self.aicpu_rules = FileManager.read_yaml_file(rule_path) self.filter_aicpu_rules(self.aicpu_rules) for checker_name, check_rule in self.aicpu_rules.items(): if not isinstance(check_rule, (list, dict,)): diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py index 8682617700702055628a31982b0eafab9feb336d..2021bcd5765d1df7489f202b3453a83924fb28dc 100644 --- a/profiler/advisor/analyzer/computation/profiling_analyzer.py +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -1,19 +1,15 @@ import logging from abc import ABC -from typing import Dict, List from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.computation.aicpu.aicpu_checker import AicpuChecker from profiler.advisor.analyzer.computation.bound.block_dim_checker import BlockDimChecker from 
profiler.advisor.analyzer.computation.bound.operator_bound_checker import OperatorBoundChecker -from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker from profiler.advisor.analyzer.computation.op_compile.dynamic_shape_checker import DynamicShapeChecker from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset -from profiler.advisor.utils.utils import get_supported_subclass logger = logging.getLogger() @@ -76,14 +72,15 @@ class BlockDimAnalyzer(ProfilingAnalyzer): def __init__(self, collection_path, **kwargs) -> None: super().__init__(collection_path, **kwargs) self.checker = BlockDimChecker(self.cann_version) - + class OperatorBoundAnalyzer(ProfilingAnalyzer): def __init__(self, collection_path, **kwargs) -> None: super().__init__(collection_path, **kwargs) self.checker = OperatorBoundChecker(self.cann_version) + class AicpuAnalyzer(ProfilingAnalyzer): def __init__(self, collection_path, **kwargs) -> None: super().__init__(collection_path, **kwargs) - self.checker = AicpuChecker(self.cann_version) \ No newline at end of file + self.checker = AicpuChecker(self.cann_version) diff --git a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..291c3a1f941cf1934c0c91b7603b6270ee66f3fb --- /dev/null +++ b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py @@ -0,0 +1,30 @@ +import logging + +from typing import List, Dict, Any + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.dataloader.dataloader_checker import DataloaderChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + +logger = logging.getLogger() + + +class DataloaderAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: + super().__init__(collection_path, n_processes, **kwargs) + key = TimelineEventDataset.get_key() + self.dataset = self.get_first_data_by_key(self.dataset_list, key) + self.result = OptimizeResult() + self.html_render = HTMLRender() + + @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + def optimize(self, **kwargs): + dataloader_checker = DataloaderChecker() + dataloader_checker.check_slow_dataloader(self.dataset) + dataloader_checker.make_record(self.result) + dataloader_checker.make_render(self.html_render) + return self.result diff --git a/profiler/advisor/analyzer/dataloader/dataloader_checker.py b/profiler/advisor/analyzer/dataloader/dataloader_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..eb1886284ef5d508f911d0c353df4574fd4a8bd3 --- /dev/null +++ b/profiler/advisor/analyzer/dataloader/dataloader_checker.py @@ -0,0 +1,84 @@ +import os +import re +import logging +import yaml + +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.cluster_analyse.common_func.file_manager import FileManager + +logger = logging.getLogger() + + +class DataloaderChecker: + + def __init__(self): + 
+ self.dataloader_issues = False + self.optimization_item = [] + self.desc = "" + self.suggestions = [] + self.dataloader_duration_threshold = None + self._init_rule() + + def check_slow_dataloader(self, event_dataset: TimelineEventDataset): + """ + :param event_dataset: dataset of timeline events + """ + if not hasattr(event_dataset, "dataloader") or not getattr(event_dataset, "dataloader"): + logger.debug("Skip slow dataloader checker, because no dataloader duration is larger than %s", + self.dataloader_duration_threshold) + return + for event in event_dataset.dataloader: + + dataloader_duration = float(event.dur) / 1000 + if dataloader_duration < self.dataloader_duration_threshold: + continue + self.desc = self.desc.format(dataloader_duration=dataloader_duration, + dataloader_duration_threshold=self.dataloader_duration_threshold) + self.dataloader_issues = True + + if re.search("singleprocess", event.name.lower()): + self.suggestions = self._reset_suggestions(["I/O", "num_workers"]) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.dataloader_issues: + return + + self.optimization_item.append(OptimizeItem("Slow dataloader", self.desc, self.suggestions)) + for optimization in self.optimization_item: + result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render): + if not self.dataloader_issues: + return + html_render.render_template(key="dataloader", + template_dir="templates", + template_name="slow_dataloader.html", + desc=self.desc, + suggestions=self.suggestions) + + def _init_rule(self): + dataloader_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + "dataloader.yaml" + ) + dataloader_rule = FileManager.read_yaml_file(dataloader_rule_path) + + self.dataloader_duration_threshold = dataloader_rule.get("dataloader_duration_threshold") + self.desc = dataloader_rule.get("problem") + self.suggestions = dataloader_rule.get("solutions") + + def _reset_suggestions(self, suggestion_pattern_list): + + suggestions = [] + for solution in self.suggestions: + for suggestion_pattern in suggestion_pattern_list: + if re.search(suggestion_pattern, solution): + suggestions.append(solution) + return suggestions diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py index e64020fdfe2ace37172e82ed562db1b66971d3d6..30bd4323795c28df8f476eafd2d43027b8682a32 100644 --- a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py @@ -149,7 +149,7 @@ class GraphFusionRules: optimization_item = OptimizeItem( "fusion issue", f"Found {len(self.candidates)} fusion issues", - ["Check fusion issues detail in att_advisor*.html"] + ["Check fusion issues detail in mstt_advisor*.html"] ) total_time = 0.0 for candidate in self.task_duration_list: diff --git a/profiler/advisor/analyzer/overall/overall_analyzer.py b/profiler/advisor/analyzer/overall/overall_analyzer.py deleted file mode 100644 index 916a396b3d096dc788954cbc8e8ba9755cd15f4e..0000000000000000000000000000000000000000 --- a/profiler/advisor/analyzer/overall/overall_analyzer.py +++ /dev/null @@ -1,45 +0,0 @@ -import logging -from typing import Dict, List - -from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.result.result import OptimizeResult -from 
profiler.compare_tools.compare_backend.utils.constant import Constant -from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface - -logger = logging.getLogger() - - -class OverallSummaryAnalyzer(BaseAnalyzer): - - def __init__(self, profiling_path, benchmark_profiling_path=None, **kwargs): - self.benchmark_profiling_path = benchmark_profiling_path or profiling_path - self.profiling_path = profiling_path - self.html_render = HTMLRender() - self.result = OptimizeResult() - - def optimize(self, **kwargs): - compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare( - Constant.OVERALL_COMPARE) - - headers = compare_result.get('Model Profiling Time Distribution').get("headers", []) - rows = compare_result.get('Model Profiling Time Distribution').get("rows", []) - - self.make_record() - self.make_render(headers=headers, rows=rows) - return compare_result - - def make_record(self): - pass - - def make_render(self, **kwargs): - headers = kwargs.get("headers") - rows = kwargs.get("rows") - - if not headers or not rows: - logger.info("Empty headers or rows, skip render overall analysis html") - self.html_render.render_template(key="overall", - template_dir="templates", - template_name="overall_analysis.html", - headers=kwargs.get("headers"), - rows=kwargs.get("rows")) diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index c74ae0510331fb9ba8a1794bd724710ba19cfabf..8e93dbda77d4915e716af856114184324d1d8807 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -13,27 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -import copy - -import logging -from typing import Dict, List +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult from profiler.compare_tools.compare_backend.utils.constant import Constant -from profiler.advisor.common import constant as const from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface -from profiler.advisor.utils.utils import get_file_path_from_directory, load_parameter class OverallSummaryAnalyzer(BaseAnalyzer): OVERALL_SUMMARY_ANALYZER = "overall_summary_analysis" advice_map = { - "Computing Time": "if you want more detailed advice please go to att_advisor_*.html", - "Uncovered Communication Time": "if you want more detailed advice please go to att_advisor_*.html", - "Free Time": "if you want more detailed advice please go to att_advisor_*.html" + "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Free Time": "if you want more detailed advice please go to mstt_advisor_*.html" } time_name_map = { "Computing Time": "computing", @@ -47,45 +41,37 @@ class OverallSummaryAnalyzer(BaseAnalyzer): 'SDMA Time(Num)': 'SDMA Time' } performance_time_dict = { - "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', - 'Flash Attention Time(Backward)(Num)', 'Other Time'], - "Uncovered Communication Time(Wait Time)": [], - "Free Time": ['SDMA Time(Num)'] + "Computing Time": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " -- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": "vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- Other Cube": "other_cube_time_ms", + "Uncovered Communication Time": "uncovered_communication_time_ms", + " -- Wait": "wait_time_ms", + " -- Transmit": "transmit_time_ms", + "Free Time": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- Free": "free_ms", + "E2E Time": "e2e_time_ms" } def __init__(self, collection_path: str, n_processes: int = 1, **kwargs): profile_path = get_profile_path(collection_path) super().__init__(profile_path, n_processes, **kwargs) - self.base_collection_path = kwargs.get("base_collection_path", "") - self._has_base_collection = False + self.benchmark_profiling_path = kwargs.get("benchmark_profiling_path", "") + self._has_benchmark_profiling = False self._is_minimal_profiling = False self.cur_data = {} - self.cur_data_table = {} self.cur_bottleneck = {} + self._disaggregate_perf = {} + self._disaggregate_benchmark_perf = {} self.cur_advices = "" - self._headers = [] - self._base_data = [] - self._comparison_data = [] self.html_render = HTMLRender() self.result = OptimizeResult() self.bottleneck_str = "" - self.bottleneck_table = {} - - @staticmethod - def split_duration_and_num(time_value: str) -> tuple: - split_data = time_value.split("s") # time value example: 0.229s(1756) - duration, num = 0.0, None - if len(split_data) >= 2: - try: - num = int(split_data[1].strip("()")) - except ValueError: - pass - if len(split_data) >= 1: - try: - duration = float(split_data[0]) - except ValueError: - print(f"[WARNING] Invalid time value: 
{time_value}.") - return duration, num + self.over_summary_analysis = {} @staticmethod def calculate_ratio(dividend, divisor): @@ -93,131 +79,121 @@ class OverallSummaryAnalyzer(BaseAnalyzer): return float("inf") return dividend / divisor + @staticmethod + def get_time_category_dict(overall_dict: dict): + time_category_dict = { + "Computing Time": round(overall_dict.get('computing_time_ms', 0.0), 3), + "Uncovered Communication Time": round(overall_dict.get('uncovered_communication_time_ms', 0.0), 3), + "Free Time": round(overall_dict.get('free_time_ms', 0.0), 3) + } + return time_category_dict + def path_check(self): - if self.base_collection_path: - if os.path.exists(self.base_collection_path): - self._has_base_collection = True + if self.benchmark_profiling_path: + if os.path.exists(self.benchmark_profiling_path): + self._has_benchmark_profiling = True else: - print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.") + print(f"[WARNING] Invalid path which not exists: {self.benchmark_profiling_path}.") return os.path.exists(self.collection_path) def process(self): - base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path - result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE) - for data in result_data.values(): - self._headers = data.get("headers", []) - rows = data.get("rows", []) - if len(rows) == 2: - self._base_data = rows[0] - self._comparison_data = rows[1] - if not self._headers or not self._comparison_data: + self._disaggregate_perf = ComparisonInterface(self.collection_path).disaggregate_perf(Constant.OVERALL_COMPARE) + if not self._disaggregate_perf: return - self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers - if self._has_base_collection: - self.cur_data["comparison_result"] = result_data - time_category_dict = {} - for time_category, time_list in self.performance_time_dict.items(): - time_value = self.get_time_value(time_category, self._comparison_data) - if time_value == Constant.INVALID_VALUE: - continue - duration, _ = self.split_duration_and_num(time_value) - time_category = time_category.split("(")[0] - time_category_dict[time_category] = duration - self.get_sub_category_time(time_category, time_list, duration) - self.cur_data["overall_data"] = time_category_dict - - def get_time_value(self, header_name: str, data_list: list): - try: - data_index = self._headers.index(header_name) - except ValueError: - return Constant.INVALID_VALUE - try: - time_value = data_list[data_index] - except IndexError: - return Constant.INVALID_VALUE - return time_value - - def get_sub_category_time(self, category: str, time_list: list, total_duration: float): - sub_time_dict = {} - for time_name in time_list: - time_value = self.get_time_value(time_name, self._comparison_data) - if time_value == Constant.INVALID_VALUE: - continue - sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, "")) - duration, num = self.split_duration_and_num(time_value) - sub_time_dict.setdefault(f"Duration(s)", []).append(duration) - sub_time_dict.setdefault(f"Duration Ratio", []).append( - "{:.2%}".format(self.calculate_ratio(duration, total_duration))) - sub_time_dict.setdefault(f"Kernel Number", []).append(num) - self.cur_data[self.time_name_map.get(category)] = sub_time_dict + self._is_minimal_profiling = self._disaggregate_perf.get("minimal_profiling", False) + self.cur_data["overall_data"] = 
self.get_time_category_dict(self._disaggregate_perf.get('overall', {})) + if self._has_benchmark_profiling: + self._disaggregate_benchmark_perf = ComparisonInterface( + self.benchmark_profiling_path).disaggregate_perf(Constant.OVERALL_COMPARE) def identify_bottleneck(self): overall_data = self.cur_data.get("overall_data") if not overall_data: return e2e_time = '%.3f' % sum([data for data in overall_data.values()]) - overall_bottleneck = f"The Model E2E Time is {e2e_time}s.\n" + overall_bottleneck = f"The Model E2E Time is {e2e_time}ms.\n" comparison_bottleneck = "" for time_type, time_value in overall_data.items(): - # add subtype time bottleneck - self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n" # add overall bottleneck - overall_bottleneck += f" -- {time_type} is {time_value}s\n" + overall_bottleneck += f" -- {time_type} is {time_value}ms\n" if time_type == "Free Time" and self._is_minimal_profiling and self.calculate_ratio(time_value, e2e_time) > 0.1: overall_bottleneck += "percentage of free time exceed the threshold 10%." - if not self._has_base_collection: + if not self._has_benchmark_profiling: continue # add comparison bottleneck - time_type_origin = "Uncovered Communication Time(Wait Time)" \ - if time_type == "Uncovered Communication Time" else time_type - base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data)) + base_duration = self.get_time_category_dict( + self._disaggregate_benchmark_perf.get('overall', {}) + ).get(time_type) if time_value > base_duration: ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration)) comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n" self.cur_bottleneck["overall_data"] = overall_bottleneck if comparison_bottleneck: self.cur_bottleneck["comparison_result"] = comparison_bottleneck + def optimize(self, **kwargs): if self.path_check(): self.process() self.identify_bottleneck() self.format_bottleneck() - self.format_cur_data() + self.format_over_summary_analysis() self.make_record() self.make_render() return self.result def format_bottleneck(self): result = '' - headers = [] - data_list = [] - data = [] - for key, value in self.cur_bottleneck.items(): + for _, value in self.cur_bottleneck.items(): if not value: continue - result += f'{key}: {value} \n' - headers.append(key) - data.append(value) - data_list.append(data) + result += f'{value} \n' self.bottleneck_str = result - self.bottleneck_table["headers"] = headers - self.bottleneck_table["data"] = data_list - def format_cur_data(self): - if not self.cur_data: - return - for data_type, data in self.cur_data.items(): - if not data: - continue - if data_type not in list(self.time_name_map.values()): - data_list = list(data.values()) - else: - data_list = [','.join(map(str, value)) for value in data.values()] - headers = list(data.keys()) - data_table = {"headers": headers, "data": [data_list]} - self.cur_data_table[data_type] = copy.deepcopy(data_table) + def format_over_summary_analysis(self): + headers = ['Performance Index', 'Duration(ms)', 'Duration Ratio'] + performance_data = self.get_analysis_data(self._disaggregate_perf) + benchmark_data = self.get_analysis_data(self._disaggregate_benchmark_perf) + if self._has_benchmark_profiling: + headers.append('Diff Duration(ms)') + self.format_analysis_with_benchmark(performance_data, benchmark_data, headers) + else: + self.format_analysis_only(performance_data, headers) + + def get_analysis_data(self, 
data_dict: dict): + if not data_dict: + return {} + return { + **data_dict.get("overall"), + **data_dict.get("computing_time_disaggregate"), + **data_dict.get("communication_time_disaggregate"), + **data_dict.get("free_time_disaggregate"), + } + def format_analysis_only(self, performance_data: dict, headers: list): + res = [] + total_duration = performance_data.get('e2e_time_ms', 0.0) + for time_name, time_key in self.performance_time_dict.items(): + row = [time_name] + duration = performance_data.get(time_key, 0.0) + row.append("{:.3f}".format(duration)) + row.append("{:.2%}".format(self.calculate_ratio(duration, total_duration))) + res.append(row) + self.over_summary_analysis["headers"] = headers + self.over_summary_analysis["data"] = res + + def format_analysis_with_benchmark(self, performance_data: dict, benchmark_data: dict, headers: list): + res = [] + total_duration = performance_data.get('e2e_time_ms', 0.0) + for time_name, time_key in self.performance_time_dict.items(): + row = [time_name] + duration = performance_data.get(time_key, 0.0) + row.append("{:.3f}".format(duration)) + row.append("{:.2%}".format(self.calculate_ratio(duration, total_duration))) + row.append("{:.3f}".format(duration - benchmark_data.get(time_key, 0.0))) + res.append(row) + self.over_summary_analysis["headers"] = headers + self.over_summary_analysis["data"] = res def make_record(self): """ @@ -232,20 +208,23 @@ class OverallSummaryAnalyzer(BaseAnalyzer): ) self.result.add(OptimizeRecord(optimization_item)) - self.result.add_detail(const.BOTTLENECK, self.bottleneck_table["headers"], self.bottleneck_table["data"][0]) - for data_type, data_dict in self.cur_data_table.items(): - if data_dict: - self.result.add_detail(const.DATA + data_type, data_dict["headers"], data_dict["data"][0]) + self.result.add_detail( + OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, + headers=self.over_summary_analysis["headers"] + ) + for data in self.over_summary_analysis["data"]: + self.result.add_detail(OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, detail=data) def make_render(self): if not self.bottleneck_str and not self.cur_advices: return + # Replace \n with an HTML line break + bottleneck_str = self.bottleneck_str.replace('\n', '<br/>')
result_for_html = { - "Description" : self.bottleneck_str, - "suggestion" : self.cur_advices, - "details" : [self.bottleneck_table] + "Description": bottleneck_str, + "suggestion": self.cur_advices, + "details": [self.over_summary_analysis] } - self.html_render.render_template(key="overall", title=OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, template_dir="templates", @@ -254,9 +233,10 @@ class OverallSummaryAnalyzer(BaseAnalyzer): torch_version=self.torch_version, result=result_for_html) + def get_profile_path(collection_path): for root, dirs, files in os.walk(collection_path): for file in files: if file.startswith("profiler_info"): return root - return "" \ No newline at end of file + return "" diff --git a/profiler/advisor/analyzer/schedule/syncbn/__init__.py b/profiler/advisor/analyzer/schedule/syncbn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6dfce5f0b9733baec9be2b28bd3389349e4436 --- /dev/null +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py @@ -0,0 +1,30 @@ +import logging + +from typing import List, Dict, Any + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.schedule.syncbn.syncbn_checker import SyncBNChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + +logger = logging.getLogger() + + +class SyncBNAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, **kwargs): + super().__init__(collection_path, **kwargs) + self.result = OptimizeResult() + self.html_render = HTMLRender() + key = TimelineEventDataset.get_key() + self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + + @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + def optimize(self, **kwargs): + syncbn_checker = SyncBNChecker() + syncbn_checker.check_syncbn(self.timeline_event_dataset) + syncbn_checker.make_record(self.result) + syncbn_checker.make_render(self.html_render) + return self.result diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..83988c4e60b0cc653083c5b68978fa20c684dccd --- /dev/null +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py @@ -0,0 +1,70 @@ +import logging +import os + +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.cluster_analyse.common_func.file_manager import FileManager + +logger = logging.getLogger() + + +class SyncBNChecker: + + def __init__(self): + self.optimization_item = [] + self.syncbn_issues = False + self.desc = "" + self.suggestions = [] + self.solutions = None + self.max_syncbn_num = None + self._init_rule() + + def check_syncbn(self, event_dataset: TimelineEventDataset): + """ + :param event_dataset: dataset of timeline events + """ + if not hasattr(event_dataset, "sync_batchnorm") or not getattr(event_dataset, "sync_batchnorm"): + 
logger.debug("Skip syncbn checker, because no syncbn found") + return + + syncbn_num = len(event_dataset.sync_batchnorm) + self.syncbn_issues = syncbn_num >= self.max_syncbn_num + self.desc = self.desc.format(syncbn_num=syncbn_num) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.syncbn_issues: + return + + self.optimization_item.append(OptimizeItem("SyncBatchNorm", self.desc, self.suggestions)) + for optimization in self.optimization_item: + result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render): + if not self.syncbn_issues: + return + html_render.render_template(key="schedule", + template_dir="templates", + template_name="sync_batchnorm.html", + desc=self.desc, + solutions=self.solutions) + + def _init_rule(self): + syncbn_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), + "rules", + "sync_batchnorm.yaml" + ) + + syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) + + self.max_syncbn_num = syncbn_rule.get("max_syncbn_num") + self.desc = syncbn_rule.get("problem") + + self.solutions = syncbn_rule.get("solutions") + for solution in self.solutions: + for key, val in solution.items(): + self.suggestions.append(f"{key}, {val.get('desc')}") diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/__init__.py b/profiler/advisor/analyzer/schedule/synchronize_stream/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..88e55449c5560a528defb3f91be58ecf57a6f120 --- /dev/null +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py @@ -0,0 +1,32 @@ +import logging + +from typing import List, Dict, Any + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_checker import SynchronizeStreamChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + +logger = logging.getLogger() + + +class SynchronizeStreamAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, **kwargs): + super().__init__(collection_path, **kwargs) + self.result = OptimizeResult() + self.html_render = HTMLRender() + + key = TimelineEventDataset.get_key() + self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + + @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + def optimize(self, **kwargs): + + synchronize_stream_checker = SynchronizeStreamChecker() + synchronize_stream_checker.check_synchronize(self.timeline_event_dataset, kwargs.get("profiling_with_stack")) + synchronize_stream_checker.make_record(self.result) + synchronize_stream_checker.make_render(self.html_render) + return self.result diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py new file mode 100644 index 
0000000000000000000000000000000000000000..03d88d281ca013389ee866ad5ffea3db9b6d517a --- /dev/null +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py @@ -0,0 +1,89 @@ +import logging + +from profiler.advisor.common import constant as const +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker +from profiler.advisor.utils.utils import format_timeline_result + +logger = logging.getLogger() + + +class SynchronizeStreamChecker(TimelineBaseChecker): + + def __init__(self): + super().__init__(n_processes=1) + self.optimization_item = [] + self.synchronize_issues = False + self.desc = "" + self.suggestions = [] + self.solutions = [] + self.max_synchronize_num = None + + def check_synchronize(self, event_dataset: TimelineEventDataset, profiling_with_stack=None): + """ + :param event_dataset: dataset of timeline events + """ + if not hasattr(event_dataset, "synchronize_stream") or not getattr(event_dataset, "synchronize_stream"): + logger.debug("Skip synchronize stream checker, because no synchronize stream was found") + return + + synchronize_num = event_dataset.synchronize_stream.total_count + slow_synchronize_stream = event_dataset.synchronize_stream.slow_synchronize_stream + total_slow_synchronize_time = sum((float(sync_stream.dur) for sync_stream in slow_synchronize_stream)) + + synchronize_stream_rule = event_dataset.synchronize_stream.rule + self.max_synchronize_num = synchronize_stream_rule.get("max_synchronize_num") + self.synchronize_issues = synchronize_num >= self.max_synchronize_num and len(slow_synchronize_stream) > 0 + if not self.synchronize_issues: + return + + for sync_stream in slow_synchronize_stream: + if sync_stream.name not in self._matched_op_index: + self._matched_op_index[sync_stream.name] = [] + self._matched_op_index[sync_stream.name].append(sync_stream.dataset_index) + self.query_stack(event_dataset, profiling_with_stack) + + self.desc = synchronize_stream_rule.get("problem") + self.desc = self.desc.format(synchronize_num=synchronize_num, + slow_synchronize_num=len(slow_synchronize_stream), + total_synchronize_stream_time=total_slow_synchronize_time) + + solutions = synchronize_stream_rule.get("solutions") + for solution in solutions: + renderer_solution = {} + for key, val in solution.items(): + if self.empty_stacks and self.framework_black_list: + # If the stacks come from frameworks such as torch or torch_npu, do not suggest modifying the code + if "modify code" in key.lower(): + continue + self.suggestions.append(f"{key}, {val.get('desc')}") + renderer_solution.update({key: val}) + self.solutions.append(renderer_solution) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.synchronize_issues: + return + + self.optimization_item.append(OptimizeItem("SynchronizeStream", self.desc, self.suggestions)) + for optimization in self.optimization_item: + result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render): + if not self.synchronize_issues: + return + + format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) + html_render.render_template(key="schedule", + template_dir="templates", + template_name="synchronize_stream.html", + desc=self.desc, + solutions=self.solutions, + result=format_result_for_html, + 
with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL, + empty_stacks=self.empty_stacks, + framework_black_list=self.framework_black_list) diff --git a/profiler/advisor/analyzer/schedule/timeline_base_checker.py b/profiler/advisor/analyzer/schedule/timeline_base_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc69150263c11006979f64d12df1dde29a45f15 --- /dev/null +++ b/profiler/advisor/analyzer/schedule/timeline_base_checker.py @@ -0,0 +1,91 @@ +from abc import ABC, abstractmethod +import multiprocessing +import logging + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult + +logger = logging.getLogger() + + +class TimelineBaseChecker(ABC): + + def __init__(self, n_processes: int = 1): + self.n_processes = n_processes + self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict() + self.matched_op_stacks = {} + self.empty_stacks = True + self.framework_black_list = False + + @abstractmethod + def make_record(self, result: OptimizeResult): + pass + + @abstractmethod + def make_render(self, html_render): + pass + + def query_stack(self, event_dataset: TimelineEventDataset = None, profiling_with_stack: str = None): + if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): + return + + event_dataset = event_dataset if not profiling_with_stack else TimelineEventDataset( + collection_path=profiling_with_stack, data={}, _datasets={}, analysis_mode="fusion_ops", + build_dataset=False) + + op_stack_list = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) + for op_stack in op_stack_list: + for op, stack in op_stack.items(): + if op not in self.matched_op_stacks: + self.matched_op_stacks[op] = {} + if stack == const.TIMELINE_FUSION_OPS_NO_STACK_FLAG: + continue + if stack not in self.matched_op_stacks[op]: + self.matched_op_stacks[op][stack] = 0 + self.matched_op_stacks[op][stack] += 1 + + def _query_stack_by_matched_index(self, index, event): + stack_record = {} + event = TimelineEvent(event) + + matched_ops = [] + for op, matched_index in self._matched_op_index.items(): + if index not in matched_index: + continue + + matched_ops.append(op) + stack = event.args.get(const.CALL_STACKS) + + if not stack: + logger.debug("Got empty '%s' for event %s", const.CALL_STACKS, event) + continue + + if not self._is_keep_stack(stack): + self.framework_black_list = True + logger.debug("Drop stack from framework %s", const.FRAMEWORK_STACK_BLACK_LIST) + continue + + if self.empty_stacks and stack: + self.empty_stacks = False + + stack_record[op] = stack + + if matched_ops and not stack_record: + for op in matched_ops: + stack_record[op] = const.TIMELINE_FUSION_OPS_NO_STACK_FLAG + + return stack_record + + def _is_keep_stack(self, stack): + # Filter out stacks from frameworks such as torch, torch_npu, megatron, and deepspeed, whose source code usually cannot be modified + stack_list = stack.replace("\\r\\n", ";").split(";") + if not stack_list: + return False + + final_called_stack = stack_list[0] + for framework in const.FRAMEWORK_STACK_BLACK_LIST: + if framework in final_called_stack.split("/"): + return False + return True diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py index 592f9d421e2bfad53a9ea621d951ae0166221623..52e3e07554f354deb62222ee0de6e66ef8b07e2e 100644 --- 
a/profiler/advisor/common/analyzer_scopes.py +++ b/profiler/advisor/common/analyzer_scopes.py @@ -12,3 +12,7 @@ class SupportedScopes: BLOCK_DIM_ANALYSIS = "block_dim_analysis" OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis" TIMELINE_OP_DISPATCH = "timeline_op_dispatch" + DATALOADER = "dataloader" + SYNCBN = "syncbn" + SYNCHRONIZE_STREAM = "synchronize_stream" + FREQ_ANALYSIS = "freq_analysis" diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py index 697430ee6cabad8c055176a3368a8b4a25e977ab..87245a43ea33981e929a16717357d9c7d1713aff 100644 --- a/profiler/advisor/common/constant.py +++ b/profiler/advisor/common/constant.py @@ -26,6 +26,7 @@ ENQUEUE = "enqueue" TORCH_TO_NPU = "torch_to_npu" OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute" OP_COMPILE_ID = "aclopCompileAndExecute" +SYNC_STREAM = "AscendCL@aclrtSynchronizeStream" MAX_OP_COMPILE_NUM = 20 ACL_TO_NPU = "acl_to_npu" TASK_TYPE = "Task Type" @@ -111,7 +112,7 @@ HTTP_PREFIXES = "http://" HTTPS_PREFIXES = "https://" COMMON_YAML_DIR = "modelarts/solution/ma_advisor_rules/" COMMON_ENDPOINT_SUFFIX = "obs.{}.myhuaweicloud.com" -INNER_ENDPOINT_SUFFIX= "obs.{}.ulanqab.huawei.com" +INNER_ENDPOINT_SUFFIX = "obs.{}.ulanqab.huawei.com" AICPU_RULES_YAML_NAME = "aicpu_rules.yaml" FUSION_PASS_YAML_NAME = "op_fusion_pass.yaml" @@ -138,4 +139,8 @@ CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv" CLUSTER_COMM_JSON = "cluster_communication.json" BOTTLENECK = "bottleneck" -DATA = "data" \ No newline at end of file +DATA = "data" + +FRAMEWORK_STACK_BLACK_LIST = ["torch", "torch_npu", "megatron", "deepspeed"] +DISABLE_STREAMING_READER = "DISABLE_STREAMING_READER" +MAX_FILE_SIZE = 10**10 diff --git a/profiler/advisor/common/graph/graph_parser.py b/profiler/advisor/common/graph/graph_parser.py index d4c67fc1918af37a837e016bd9e5b813957b1aef..ef4dc4d681e0664c12120c9c8904ad48970a5840 100644 --- a/profiler/advisor/common/graph/graph_parser.py +++ b/profiler/advisor/common/graph/graph_parser.py @@ -1,11 +1,12 @@ import os import logging -import yaml import itertools from collections import deque from dataclasses import dataclass from typing import List, Tuple, Dict +from profiler.cluster_analyse.common_func.file_manager import FileManager + logger = logging.getLogger() @@ -344,9 +345,9 @@ class QueryGraphParser: if not os.path.exists(rule_database): raise FileNotFoundError(f"Path {rule_database} does not exist.") - with open(rule_database, 'r') as f: - database = yaml.safe_load(f) - self.parse_yaml(database) + + database = FileManager.read_yaml_file(rule_database) + self.parse_yaml(database) def parse_yaml(self, yaml_database): fusion_strategy_list = yaml_database.get("GraphFusion", []) diff --git a/profiler/advisor/common/profiling/ge_info.py b/profiler/advisor/common/profiling/ge_info.py index 9996ec611a2a835bd8dffd24c3fbe7d8817ec29a..4fd5846d88ddbab5d898c020b76537c1ec52db3b 100644 --- a/profiler/advisor/common/profiling/ge_info.py +++ b/profiler/advisor/common/profiling/ge_info.py @@ -17,12 +17,13 @@ class GeInfo(ProfilingParser): """ ge info file """ - FILE_PATTERN = r"ge_info.db" FILE_PATTERN_MSG = "ge_info.db" FILE_INFO = "ge info" STATIC_OP_STATE = "0" DYNAMIC_OP_STATE = "1" + file_pattern_list = [r"ge_info.db"] + def __init__(self, path: str) -> None: super().__init__(path) self.op_state_info_list = None diff --git a/profiler/advisor/common/profiling/msprof.py b/profiler/advisor/common/profiling/msprof.py index 9453986b8225ccad68f2135d674e3832d987fcf0..750c5481e67e31e5e85c4a38ae3a299abed70187 100644 --- 
a/profiler/advisor/common/profiling/msprof.py +++ b/profiler/advisor/common/profiling/msprof.py @@ -33,10 +33,11 @@ class Msprof(ProfilingParser): msprof """ - FILE_PATTERN = r"^msprof[_\d]+.json$" FILE_PATTERN_MSG = "msprof_*.json" FILE_INFO = "msprof" + file_pattern_list = [r"^msprof[_\d]+.json$"] + def __init__(self, path: str) -> None: super().__init__(path) self._tasks: List[TaskInfo] = [] diff --git a/profiler/advisor/common/profiling/op_summary.py b/profiler/advisor/common/profiling/op_summary.py index d79439dbad8e2c105bed737c1a1c3be1a2cecfc1..4744b5029ad6f06d5ee5a60426fb9b40b0a8c3c8 100644 --- a/profiler/advisor/common/profiling/op_summary.py +++ b/profiler/advisor/common/profiling/op_summary.py @@ -16,13 +16,13 @@ class OpSummary(ProfilingParser): """ op summary """ - - FILE_PATTERN = r"^op_summary_[_\d]+\.csv$" FILE_PATTERN_MSG = "op_summary_*.csv" FILE_INFO = "op summary" STATIC_OP_STATE = "static" DYNAMIC_OP_STATE = "dynamic" + file_pattern_list = [r"^op_summary_[_\d]+\.csv$"] + def __init__(self, path: str) -> None: super().__init__(path) self.op_list: List[OpInfo] = [] diff --git a/profiler/advisor/common/profiling/tasktime.py b/profiler/advisor/common/profiling/tasktime.py index 3ce09a783851e94163aa72f423788a373da5eb3a..732ff0f36796049fee5ff2521360ca3183ceafce 100644 --- a/profiler/advisor/common/profiling/tasktime.py +++ b/profiler/advisor/common/profiling/tasktime.py @@ -17,11 +17,11 @@ class TaskTime(ProfilingParser): """ task time info """ - - FILE_PATTERN = r"^task_time_[_\d]+\.json$" FILE_PATTERN_MSG = "task_time*.json" FILE_INFO = "task time" + file_pattern_list = [r"^task_time_[_\d]+\.json$"] + def __init__(self, path: str) -> None: super().__init__(path) self._tasks: List[TaskInfo] = [] diff --git a/profiler/advisor/common/timeline/fusion_ops_db.py b/profiler/advisor/common/timeline/fusion_ops_db.py index 8637befd1ab108928bdf8f4fdb19d9cab03ff960..64cc849295ffb6758d1fc8fd77d71e13d0157204 100644 --- a/profiler/advisor/common/timeline/fusion_ops_db.py +++ b/profiler/advisor/common/timeline/fusion_ops_db.py @@ -1,13 +1,12 @@ import logging import os -import yaml - from profiler.advisor.common import constant from profiler.advisor.common.timeline.fusion_ops_rule import OpRule from profiler.advisor.common.timeline.fusion_ops_rule_handler import TimelineOpRuleHandler from profiler.advisor.utils.log import get_log_level from profiler.advisor.utils.utils import get_file_path_by_walk +from profiler.cluster_analyse.common_func.file_manager import FileManager logger = logging.getLogger() logger.setLevel(get_log_level()) @@ -241,8 +240,7 @@ class FusionOperatorDB: logger.debug("The rule yaml file is successfully found in path: %s", os.path.abspath(file_path)) - with open(file_path, "rb") as file: - db_content = yaml.safe_load(file) + db_content = FileManager.read_yaml_file(file_path) if not self._is_version_supported(db_content): self.is_empty = True diff --git a/profiler/advisor/config/config.ini b/profiler/advisor/config/config.ini index c56c1dad9f0d7e9ac02ab76b0e79e102b010da12..06e993160104a5c8e044b1e385f6713470637831 100644 --- a/profiler/advisor/config/config.ini +++ b/profiler/advisor/config/config.ini @@ -9,6 +9,7 @@ tune_ops_file = operator_tuning_file.cfg [THRESHOLD] # operator_bound_ratio: (mte, cube, vector, scalar) ratio greater than this value will be checked in operator_bound_checker operator_bound_ratio = 0.8 +frequency_threshold = 0.05 [RULE-BUCKET] # region : URL of different regions where can download rule yaml file cn-north-9 = cnnorth9-modelarts-sdk diff 
--git a/profiler/advisor/config/config.py b/profiler/advisor/config/config.py index 12f4526f8c95a747f97272aed6cf8e4e822da676..4f36dfedfc8e1624f3740951ea982b8cd3196657 100644 --- a/profiler/advisor/config/config.py +++ b/profiler/advisor/config/config.py @@ -97,6 +97,13 @@ class Config: """ return float(self.config.get("THRESHOLD", "operator_bound_ratio")) + @property + def frequency_threshold(self) -> float: + """ + frequency_threshold + """ + return float(self.config.get("THRESHOLD", "frequency_threshold")) + def set_log_path(self, result_file: str, log_path: str = None): self.log_path = log_path if log_path is not None else os.path.join(self._work_path, "log") os.makedirs(self.log_path, exist_ok=True) diff --git a/profiler/advisor/config/profiling_data_version_config.yaml b/profiler/advisor/config/profiling_data_version_config.yaml index 4ef76105a07c28c5072c4bbfe20fd39a938038b7..b8c92fe074d3bf67a23214d18f6a2438be130314 100644 --- a/profiler/advisor/config/profiling_data_version_config.yaml +++ b/profiler/advisor/config/profiling_data_version_config.yaml @@ -1,18 +1,19 @@ versions: - version: 8.0.RC1 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: - mindstudio_profiler_output: - [ op_summary, msprof ] + mindstudio_profiler_output: [ op_summary, msprof ] class_attr: op_summary: OpSummary msprof: Msprof file_attr: - op_summary: ^op_summary_\d{14}\.csv$ msprof: ^msprof_\d{14}\.json$ + op_summary: [ kernel_details.csv, '^op_summary_\d{14}\.csv$' ] - version: 7.0.0 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: ^device_\d+$: summary: @@ -28,13 +29,14 @@ versions: msprof: Msprof ge_info: GeInfo file_attr: - op_summary: ^op_summary_\d+_\d+_\d{14}\.csv$ + op_summary: [ kernel_details.csv, '^op_summary_\d+_\d+_\d{14}\.csv$'] task_time: ^task_time_\d+_\d+_\d{14}\.json$ msprof: ^msprof_\d+_\d+_\d{14}\.json$ ge_info: ge_info.db - version: 7.0.RC1 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: ^device_\d+$: summary: @@ -50,13 +52,14 @@ versions: msprof: Msprof ge_info: GeInfo file_attr: - op_summary: ^op_summary_\d+_\d+_\d+_\d{14}\.csv$ + op_summary: [ kernel_details.csv, '^op_summary_\d+_\d+_\d+_\d{14}\.csv$'] task_time: ^task_time_\d+_\d+_\d+_\d{14}\.json$ msprof: ^msprof_\d+_\d+_\d+_\d{14}\.json$ ge_info: ge_info.db - version: 6.3.RC2 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: ^device_\d+$: summary: @@ -72,9 +75,7 @@ versions: msprof: Msprof ge_info: GeInfo file_attr: - op_summary: ^op_summary_\d+_\d+\.csv$ + op_summary: [ kernel_details.csv, '^op_summary_\d+_\d+\.csv$'] task_time: ^task_time_\d+_\d+\.json$ msprof: ^msprof_\d+_\d+\.json$ ge_info: ge_info.db - - diff --git a/profiler/advisor/dataset/ai_core_freq/__init__.py b/profiler/advisor/dataset/ai_core_freq/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/dataset/ai_core_freq/ai_core_freq_dataset.py b/profiler/advisor/dataset/ai_core_freq/ai_core_freq_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c99baea6564eeae5efd9585342d1f40a40ea745d --- /dev/null +++ b/profiler/advisor/dataset/ai_core_freq/ai_core_freq_dataset.py @@ -0,0 +1,148 @@ +import json +import logging +import math +import os +import traceback + +import ijson +from tqdm import tqdm + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from 
profiler.advisor.utils.utils import get_file_path_from_directory +from profiler.advisor.utils.utils import convert_to_float, parse_json_with_generator +from profiler.advisor.dataset.profiling.device_info import DeviceInfoParser +from profiler.advisor.config.config import Config + +logger = logging.getLogger() + + +class AICoreFreqDataset: + + def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: + + self._profiler_step = [] + self._ai_core_ops = [] + self._ai_core_freq: [TimelineEvent] = [] + self._previous_freq_index = -1 + + self.timeline_dir = collection_path + self.timeline_data_list = get_file_path_from_directory(collection_path, + lambda file: file.endswith("trace_view.json")) + + self.step = kwargs.get("step") + self.op_freq = {} + info = DeviceInfoParser(collection_path) + info.parse_data() + if not Config().get_config("aic_frequency"): + return + if self.parse(): + key = self.get_key() + if key not in data: + data[key] = [] + data[key].append(self) + + @property + def profiler_step(self): + return self._profiler_step + + @property + def ai_core_freq(self): + return self._ai_core_freq + + @property + def ai_core_ops(self): + return self._ai_core_ops + + @classmethod + def get_key(cls): + """ + get key of dataset + :return: key + """ + return cls.__module__.rsplit('.', maxsplit=1)[-1] + + def parse(self): + + if len(self.timeline_data_list) == 0: + logger.warning("Please ensure trace_view.json exists in %s, skip timeline analysis.", self.timeline_dir) + return False + + if len(self.timeline_data_list) > 1: + logger.warning("Found multiple trace_view.json in %s, load the file of device 0 for analysis.", + self.timeline_dir) + + _ = parse_json_with_generator(sorted(self.timeline_data_list)[0], self._add_event) + + target_ai_core_ops = self._get_target_ai_core_ops() + self._get_op_frequency(target_ai_core_ops) + return True + + def _add_profiler_step(self, event): + if event.name.startswith("ProfilerStep"): + self._profiler_step.append(event) + + def _add_ai_core_ops(self, event): + if event.args.get("Task Type") in ["MIX_AIC", "AI_CORE"]: + self._ai_core_ops.append(event) + + def _add_ai_core_freq(self, event): + if event.name == "AI Core Freq": + if self._previous_freq_index != -1: + self._ai_core_freq[self._previous_freq_index]["end"] = event.get("ts", float(math.inf)) + self._previous_freq_index += 1 + event.setdefault("end", float(math.inf)) + self._ai_core_freq.append(event) + + def _add_event(self, index, event): + event["dataset_index"] = index + if not isinstance(event, TimelineEvent): + event = TimelineEvent(event) + + self._add_profiler_step(event) + self._add_ai_core_ops(event) + self._add_ai_core_freq(event) + + return True + + def _get_target_ai_core_ops(self): + target_ai_core_ops = [] + if not self.step or f"ProfilerStep#{self.step}" not in [event.name for event in self._profiler_step]: + target_ai_core_ops = self._ai_core_ops + else: + for step_event in self._profiler_step: + if step_event.name != f"ProfilerStep#{self.step}": + continue + + for ai_core_op_event in self._ai_core_ops: + if step_event.ts_include(ai_core_op_event): + target_ai_core_ops.append(ai_core_op_event) + target_ai_core_ops = sorted(target_ai_core_ops, key=lambda x: float(x.ts)) + return target_ai_core_ops + + def _get_op_frequency(self, ai_core_ops): + ai_core_freq = sorted(self._ai_core_freq, key=lambda x: float(x.ts)) + + op_index, freq_index = 0, 0 + while op_index < len(ai_core_ops) and freq_index < len(ai_core_freq): + op_event = ai_core_ops[op_index] + op_end_time = 
convert_to_float(op_event.ts) + convert_to_float(op_event.dur) + op_freq_list = [] + while freq_index < len(ai_core_freq): + freq_event = ai_core_freq[freq_index] + if convert_to_float(freq_event.end) < op_end_time: + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + freq_index += 1 + continue + elif convert_to_float(freq_event.ts) < op_end_time: + if op_event.name not in self.op_freq: + self.op_freq[op_event.name] = {"count": 0, "dur": 0, "freq_list": []} + self.op_freq[op_event.name]["count"] += 1 + self.op_freq[op_event.name]["dur"] += convert_to_float(op_event.dur) + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + self.op_freq[op_event.name]["freq_list"].append(min(op_freq_list)) + break + else: + break + + op_index += 1 diff --git a/profiler/advisor/dataset/profiling/device_info.py b/profiler/advisor/dataset/profiling/device_info.py index b58930777f969d023eab7885a9095d46aa7ba6ea..110cd0794c6cb153644b9d2e59c7d0793eb280b4 100644 --- a/profiler/advisor/dataset/profiling/device_info.py +++ b/profiler/advisor/dataset/profiling/device_info.py @@ -54,6 +54,8 @@ class DeviceInfoParser: config.set_config("device_id", device_info["id"]) if "aiv_num" in device_info: config.set_config("aiv_num", device_info["aiv_num"]) + if "aic_frequency" in device_info: + config.set_config("aic_frequency", device_info["aic_frequency"]) if "ai_core_num" in device_info: config.set_config("ai_core_num", device_info["ai_core_num"]) return True diff --git a/profiler/advisor/dataset/profiling/profiling_dataset.py b/profiler/advisor/dataset/profiling/profiling_dataset.py index 46d4a4fe8b12a419f6d0d7472f9776369e122f03..ebd90951abf5290d376efd13c257b90878343381 100644 --- a/profiler/advisor/dataset/profiling/profiling_dataset.py +++ b/profiler/advisor/dataset/profiling/profiling_dataset.py @@ -10,6 +10,7 @@ from profiler.advisor.common.profiling.tasktime import TaskTime from profiler.advisor.dataset.dataset import Dataset from profiler.advisor.dataset.profiling.device_info import DeviceInfoParser from profiler.advisor.utils.utils import join_prof_path +from profiler.cluster_analyse.common_func.file_manager import FileManager logger = logging.getLogger() @@ -42,14 +43,21 @@ class ProfilingDataset(Dataset): self.build_from_pattern(value, join_prof_path(current_path, key)) elif isinstance(dirs_pattern, list): for item in dirs_pattern: + if hasattr(self, item) and getattr(self, item): + # Avoid rebuilding the data objects for kernel_details.csv and op_summary.csv + continue + file_pattern_list = self.current_version_pattern.get('file_attr').get(item) data_class = globals()[self.current_version_pattern.get('class_attr').get(item)] - data_class.FILE_PATTERN = self.current_version_pattern.get('file_attr').get(item) + if not hasattr(data_class, "file_pattern_list"): + continue + setattr(data_class, "file_pattern_list", self.current_version_pattern.get('file_attr').get(item)) data_object = data_class(current_path) is_success = data_object.parse_data() if is_success: setattr(self, item, data_object) else: - logger.warning("Skip parse %s from local path %s", self.current_version_pattern.get('class_attr').get(item), current_path) + logger.info("Skip parse %s with file pattern %s from local path %s", + self.current_version_pattern.get('class_attr').get(item), file_pattern_list, current_path) else: logger.warning(f"Unsupported arguments : %s to build %s", dirs_pattern, self.__class__.__name__) @@ -69,8 +77,7 @@ class ProfilingDataset(Dataset): logger.warning("Skip parse profiling dataset, because %s does not exist.", config_path) return [] 
- with open(config_path, 'r') as f: - patterns = yaml.safe_load(f) + patterns = FileManager.read_yaml_file(config_path) return patterns diff --git a/profiler/advisor/dataset/profiling/profiling_parser.py b/profiler/advisor/dataset/profiling/profiling_parser.py index bb4caeb29e5c94cbc4373b1d6b10e32f3e10e02e..51996617c2b83a3a1e4d1f873140957c8ff68b51 100644 --- a/profiler/advisor/dataset/profiling/profiling_parser.py +++ b/profiler/advisor/dataset/profiling/profiling_parser.py @@ -12,10 +12,10 @@ class ProfilingParser: """ profiling """ - FILE_PATTERN = "" FILE_PATTERN_MSG = "" FILE_INFO = "" - FILE_PATH = "" + + file_pattern_list = [] def __init__(self, path: str) -> None: self._path = path @@ -37,15 +37,20 @@ class ProfilingParser: return False def _parse_from_file(self): - file_list = get_file_path_from_directory(self._path, self.file_match_func(self.FILE_PATTERN)) - if not file_list: - return False - ## get last file - file = file_list[-1] - self.FILE_PATH = file - if len(file_list) > 1: - logger.warning("Multiple copies of %s were found, use %s", self.FILE_INFO, file) - return self.parse_from_file(file) + + if not isinstance(self.file_pattern_list, list): + self.file_pattern_list = [self.file_pattern_list] + + for file_pattern in self.file_pattern_list: + file_list = get_file_path_from_directory(self._path, self.file_match_func(file_pattern)) + if not file_list: + continue + ## get last file + target_file = file_list[-1] + if len(file_list) > 1: + logger.warning("Multiple copies of %s were found, use %s", self.FILE_INFO, target_file) + return self.parse_from_file(target_file) + return False @staticmethod def get_float(data) -> float: diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index d3889e4458fad8b34b5d811d152e255638999294..1504e65f54fd32398e6873de267992e59606fe4d 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -1,14 +1,16 @@ +import json import logging -from typing import List +import os +from typing import List, Any +import traceback import ijson -from profiler.advisor.dataset.dataset import Dataset from tqdm import tqdm +import yaml from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.advisor.utils.utils import get_file_path_from_directory -from profiler.advisor.utils.utils import singleton +from profiler.advisor.utils.utils import get_file_path_from_directory, check_path_valid, singleton from profiler.cluster_analyse.common_func.file_manager import FileManager logger = logging.getLogger() @@ -39,37 +41,76 @@ class OpCompileCollector: self._total_op_compile_time = 0.0 +class SynchronizeStreamCollector: + + def __init__(self): + self._synchronize_stream_count = 0 + self._slow_synchronize_stream = [] + self.rule = SynchronizeStreamCollector._load_rule() + + @property + def total_count(self): + return self._synchronize_stream_count + + @property + def slow_synchronize_stream(self): + return self._slow_synchronize_stream + + @staticmethod + def _load_rule(): + sync_stream_rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "rules", + "synchronize.yaml") + + sync_stream_rule = FileManager.read_yaml_file(sync_stream_rule_path) + return sync_stream_rule + + def update_sync_stream_count(self): + self._synchronize_stream_count += 1 + + def append_slow_sync_stream(self, event): + if float(event.dur) / 1000 >= 
self.rule.get("slow_synchronize_threshold", 10): + self._slow_synchronize_stream.append(event) + + def unset(self): + self._synchronize_stream_count = 0 + self._slow_synchronize_stream = [] + + @singleton -class TimelineEventDataset(Dataset): +class TimelineEventDataset: - def __init__(self, collection_path, data: dict, **kwargs) -> None: + def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: self._ops_with_task_type = {} self._ops_with_stack = {} self._ops_compile = OpCompileCollector() self._torch_to_npu = {} self._acl_to_npu = set() - self._aten: List[str] = [] - self._optimizer: List[str] = [] + self._aten: List[Any] = [] + self._optimizer: List[Any] = [] + self._dataloader: List[Any] = [] + self._sync_batchnorm: List[Any] = [] + self._synchronize_stream = SynchronizeStreamCollector() self.timeline_dir = collection_path - self.timeline_data_list = get_file_path_from_directory(collection_path, lambda file: file.endswith("trace_view.json")) + self.timeline_data_list = get_file_path_from_directory(collection_path, + lambda file: file.endswith("trace_view.json")) self.dataset_len = None self.analysis_mode = kwargs.get("analysis_mode") self.task_type = kwargs.get("task_type") - self.cann_version = kwargs.get("cann_version") - self.torch_version = kwargs.get("torch_version") - if self.analysis_mode in ["fusion_ops", "all"]: - logger.info("Load fusion operators database for cann version '%s' and torch version '%s'", - self.cann_version, self.torch_version) + if not build_dataset: + return - super().__init__(collection_path, data) + if self.parse(): + key = self.get_key() + if key not in data: + data[key] = [] + data[key].append(self) if self.analysis_mode in ["op_stack", "all"]: self._task_op_names = list(set([event_key.split("-")[0] for event_key in self._ops_with_task_type.keys()])) self._post_process() - @property def ops_with_stack(self): return self._ops_with_stack @@ -102,36 +143,60 @@ class TimelineEventDataset(Dataset): def aten(self): return self._aten - def _parse(self): + @property + def dataloader(self): + return self._dataloader + + @property + def sync_batchnorm(self): + return self._sync_batchnorm + + @property + def synchronize_stream(self): + return self._synchronize_stream + + @classmethod + def get_key(cls): + """ + get key of dataset + :return: key + """ + return cls.__module__.rsplit('.', maxsplit=1)[-1] + + def parse(self): if len(self.timeline_data_list) == 0: logger.warning("Please ensure trace_view.json in %s, skip timeline analysis.", self.timeline_dir) return False if len(self.timeline_data_list) > 1: - logger.warning("Please ensure only one trace_view.json in %s, there will analyze first timeline profiling data.", self.timeline_dir) - self.timeline_data_list = [self.timeline_data_list[0]] + logger.warning("Found multiple trace_view.json in %s, load the file of device 0 for analysis .", + self.timeline_dir) result = self.parse_data_with_generator(self._add_event) if not self.dataset_len: self.dataset_len = len(result) - return True def parse_data_with_generator(self, func): result = [] + timeline_data_path = sorted(self.timeline_data_list)[0] + if not check_path_valid(timeline_data_path): + return result + try: - json_content = FileManager.read_json_file(self.timeline_data_list[0]) - for i, event in tqdm(enumerate(json_content), leave=False, ncols=100, - desc="Building dataset for timeline analysis", - total=self.dataset_len): - func_res = func(index=i, event=event) - if func_res: - result.append(func_res) - except Exception as e: - 
logger.warning("Error %s while parsing file %s, continue to timeline analysis", e, - self.timeline_data_list[0]) + with open(timeline_data_path, "r") as f: + for i, event in tqdm(enumerate(ijson.items(f, "item")), + leave=False, ncols=100, desc="Building dataset for timeline analysis", + total=self.dataset_len): + func_res = func(index=i, event=event) + if func_res is not None: + result.append(func_res) + + except Exception: + logger.warning("Error %s while parsing file %s, continue to timeline analysis", traceback.format_exc(), + timeline_data_path) return result def _add_ops_with_task_type(self, event): @@ -169,12 +234,40 @@ class TimelineEventDataset(Dataset): "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur })) + def _add_dataloader(self, event: TimelineEvent): + if "dataloader" in event.name.lower(): + self._dataloader.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur, + "stack": event.args.get("Call stack") + })) + + def _add_sync_batchnorm(self, event: TimelineEvent): + if event.name.lower() == "syncbatchnorm": + self._sync_batchnorm.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur + })) + + def _add_synchronize(self, event: TimelineEvent): + if event.name.startswith(const.SYNC_STREAM): + self._synchronize.append(TimelineEvent({ + "name": event.name, "ts": event.ts, "dur": event.dur + })) + + def _add_specific_operator(self, event): + # for analysis of operator aclOpCompile, enable jit_compILE=False + self._add_op_compile(event) + # for analysis of slow dataloader.__next__ + self._add_dataloader(event) + # for analysis of syncBatchNorm operator, prompt users to replace source code of torch_npu's syncbn + self._add_sync_batchnorm(event) + def _add_event(self, index, event): event["dataset_index"] = index if not isinstance(event, TimelineEvent): event = TimelineEvent(event) - self._add_op_compile(event) + self._add_specific_operator(event) + if self.analysis_mode == "fusion_ops": self._add_event_for_fusion_ops(event) elif self.analysis_mode == "op_stack": @@ -190,6 +283,10 @@ class TimelineEventDataset(Dataset): self._add_aten(event) return + # 检查cann层同步操作,根据时间窗口索引到host侧的aten算子并给出堆栈 + if event.name.startswith(const.SYNC_STREAM): + self._add_aten(event) + if event.name.startswith(f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"): self._add_optimizer(event) return @@ -215,7 +312,18 @@ class TimelineEventDataset(Dataset): # eliminate sub aten operator of the first level aten operator by 'ts' and 'dur', # keep the first level aten operator contiguous formated_atens = [] - for aten_event in sorted(self._aten, key=lambda x: x.get("ts", -1)): - if not formated_atens or not formated_atens[-1].ts_include(aten_event): - formated_atens.append(aten_event) + for event in sorted(self._aten, key=lambda x: x.get("ts", -1)): + if event.name.startswith(const.ATEN): + if not formated_atens or not formated_atens[-1].ts_include(event): + formated_atens.append(event) + + elif event.name.startswith(const.SYNC_STREAM): + self._synchronize_stream.update_sync_stream_count() + if formated_atens[-1].ts_include(event): + # 使用aten算子的索引,用于查询堆栈 + event["dataset_index"] = formated_atens[-1].get("dataset_index") + self._synchronize_stream.append_slow_sync_stream(event) + + else: + continue self._aten = formated_atens diff --git a/profiler/advisor/display/html/templates/ai_core_frequency.html 
b/profiler/advisor/display/html/templates/ai_core_frequency.html new file mode 100644 index 0000000000000000000000000000000000000000..d04514203733b445ecb6ce2b69435ce5a86e353d --- /dev/null +++ b/profiler/advisor/display/html/templates/ai_core_frequency.html @@ -0,0 +1,27 @@ +{% if data|length > 0 %} +
+

AI CORE Frequency Issues

+
+ Issue: {{ desc }} +
+ Suggestion: {{ suggestion }} +

+ + + {% for header in headers %} + + {% endfor %} + + + {% for row in data %} + + {% for element in row %} + + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element|safe }}
+ +
+
+{% endif %} \ No newline at end of file diff --git a/profiler/advisor/display/html/templates/slow_dataloader.html b/profiler/advisor/display/html/templates/slow_dataloader.html new file mode 100644 index 0000000000000000000000000000000000000000..ae3a22f283c9b2a2123d9252432f01446f94803d --- /dev/null +++ b/profiler/advisor/display/html/templates/slow_dataloader.html @@ -0,0 +1,18 @@ +
+

Slow Dataloader Issues

+
+ {{ desc }} + + + + + + {% for suggestion in suggestions %} + + + + {% endfor %} +
Suggestions
{{ loop.index }}. {{ suggestion|safe }}
+ +
+
diff --git a/profiler/advisor/display/html/templates/sync_batchnorm.html b/profiler/advisor/display/html/templates/sync_batchnorm.html new file mode 100644 index 0000000000000000000000000000000000000000..0a4cb3e73021843dcef340934b214af0f23ead1a --- /dev/null +++ b/profiler/advisor/display/html/templates/sync_batchnorm.html @@ -0,0 +1,30 @@ + +
+

SyncBatchNorm Issues

+
+ {{ desc }} + + + + + {% for item in solutions %} + {% set rowloop = loop %} + {% for key, value in item.items() %} + + + + {% endfor %} + {% endfor %} +
Suggestions
{{ rowloop.index }}. {{ value.desc }}
+ + More efficient code of syncbn forward as follows: + {% for item in solutions %} + {% for key, value in item.items() %} + {% if 'efficient_code' in value %} +
{{ value.efficient_code|safe }}
+ {% endif %} + {% endfor %} + {% endfor %} + +
+
diff --git a/profiler/advisor/display/html/templates/synchronize_stream.html b/profiler/advisor/display/html/templates/synchronize_stream.html new file mode 100644 index 0000000000000000000000000000000000000000..fd95b48615100a89caa0da71efbc3e958041278e --- /dev/null +++ b/profiler/advisor/display/html/templates/synchronize_stream.html @@ -0,0 +1,57 @@ +
+

Synchronize Stream Issues

+
+ {{ desc }} + + + + + + + {% for item in solutions %} + {% set rowloop = loop %} + {% for key, value in item.items() %} + + + + + {% endfor %} + {% endfor %} +
Suggestions
{{ rowloop.index }}. {{ value.desc }}
+ +
+        {% if not empty_stacks %}
+        Please click on the collapsible box below to view the detailed code stack that triggers SynchronizeStream.
+        {% elif not framework_black_list %}
+        Suggestion:
+        These operators have no code stack. If the parameter 'with_stack=False' was set while profiling, please refer to
+        Ascend PyTorch Profiler to set
+        'with_stack=True'. Otherwise, ignore the following affinity APIs, since backward broadcast operators lack a code stack.
+        {% endif %}
+
+        {% for api_name, stacks in result.items() %}
+
+        {% if empty_stacks %}
+
{{api_name|safe}}
+ + {% elif stacks | length > 0 %} + +
{{api_name|safe}}
+
+
+ {% for stack in stacks %} +
No.{{loop.index|safe}} code stack, called {{stack[1]|safe}} times
+ + {% endfor %} +
+
+ {% endif %} + + {% endfor %} + +
+ +
+
diff --git a/profiler/advisor/img/overall.png b/profiler/advisor/img/overall.png index 6d5da107a3f7f6c8c655922bd80d193708fe71aa..1883d4c97388b1cfb774d05fc9e0d368d0c66901 100644 Binary files a/profiler/advisor/img/overall.png and b/profiler/advisor/img/overall.png differ diff --git a/profiler/advisor/img/overall_0.png b/profiler/advisor/img/overall_0.png new file mode 100644 index 0000000000000000000000000000000000000000..f74cf2dcf131f36df9901e20ea327d509c6fee67 Binary files /dev/null and b/profiler/advisor/img/overall_0.png differ diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py index 59bfee77f60c24194cc3f392fc9c557d0f1ed70a..1d3872a1783111af7b1f543241da6b23fb14a632 100644 --- a/profiler/advisor/interface/interface.py +++ b/profiler/advisor/interface/interface.py @@ -13,23 +13,31 @@ from profiler.advisor.analyzer.cluster.slow_rank_analyser import SlowRankAnalyze from profiler.advisor.analyzer.cluster.slow_link_analyser import SlowLinkAnalyzer from profiler.advisor.analyzer.overall.overall_summary_analyzer import OverallSummaryAnalyzer from profiler.advisor.analyzer.schedule.dispatch.timeline_op_dispatch_analyzer import OpDispatchAnalyzer +from profiler.advisor.analyzer.schedule.syncbn.syncbn_analyzer import SyncBNAnalyzer +from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_analyzer import SynchronizeStreamAnalyzer +from profiler.advisor.analyzer.dataloader.dataloader_analyzer import DataloaderAnalyzer +from profiler.advisor.analyzer.computation.ai_core_freq.ai_core_freq_analyzer import AICoreFreqAnalyzer + class Interface: supported_analyzer = { "schedule": OrderedDict({ - SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer, - SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer + SupportedScopes.SYNCBN: SyncBNAnalyzer, + SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer, + SupportedScopes.SYNCHRONIZE_STREAM: SynchronizeStreamAnalyzer, + SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer }), "computation": OrderedDict({ SupportedScopes.DYNAMIC_SHAPE_ANALYSIS: DynamicShapeAnalyzer, SupportedScopes.AICPU_ANALYSIS: AicpuAnalyzer, SupportedScopes.OPERATOR_NO_BOUND_ANALYSIS: OperatorBoundAnalyzer, SupportedScopes.BLOCK_DIM_ANALYSIS: BlockDimAnalyzer, - SupportedScopes.GRAPH: FusionOPAnalyzer + SupportedScopes.GRAPH: FusionOPAnalyzer, + SupportedScopes.FREQ_ANALYSIS: AICoreFreqAnalyzer }), "communication": OrderedDict(), "overall": OrderedDict({SupportedScopes.OVER_ALL: OverallSummaryAnalyzer}), - "dataloader": OrderedDict(), + "dataloader": OrderedDict({SupportedScopes.DATALOADER: DataloaderAnalyzer}), "cluster": OrderedDict({ SupportedScopes.SLOW_RANK: SlowRankAnalyzer, SupportedScopes.SLOW_LINK: SlowLinkAnalyzer @@ -66,7 +74,7 @@ class Interface: if render_html and result.data: if hasattr(analyzer, "html_render"): analyzer.html_render.render_html() - analyzer.html_render.save_to_file(f'att_advisor_{Timer().strftime}.html') + analyzer.html_render.save_to_file(f'mstt_advisor_{Timer().strftime}.html') return result if not output_dict else dict(result.data) diff --git a/profiler/advisor/result/item.py b/profiler/advisor/result/item.py index fa0ffb5b1c769dd5e7a0d69523d0c94a65ffaf19..02db7fdd0044e480ff7af524c4ba8ee34ee45a38 100644 --- a/profiler/advisor/result/item.py +++ b/profiler/advisor/result/item.py @@ -15,7 +15,7 @@ class OptimizeItem: @property def headers(self): - return ["problem", "description", "suggestion"] + return ["category", "description", "suggestion"] class StatisticsItem: 
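Note on the header rename in the hunks above and below: the advisor's result header "problem" becomes "category" both in the Excel "problems" sheet and in the terminal table. As a quick illustration of what the renamed terminal output looks like, here is a minimal prettytable sketch; the row content is invented for the example and is not part of this patch:

```python
# Illustrative sketch only: renders the renamed advisor headers with prettytable,
# mirroring how TerminalResult configures its table.
from prettytable import ALL, PrettyTable

table = PrettyTable(["No.", "Category", "Description", "Suggestion"],
                    max_table_width=180)
table.hrules = ALL  # draw a horizontal rule between every row
table.add_row([1, "AI CORE Frequency", "Found AI Core frequency reduction",
               "Check device temperature and power supply"])
print(table)
```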
diff --git a/profiler/advisor/result/result.py b/profiler/advisor/result/result.py
index c7d7da8663c8f0105734ec211e2a55a988030465..0d0602ee56c2090cef9833cfd1ac594cd8d36169 100644
--- a/profiler/advisor/result/result.py
+++ b/profiler/advisor/result/result.py
@@ -93,6 +93,9 @@ class SheetRecoder:
         if data not in self._sheet_data[sheet_name]["data"]:
             self._sheet_data[sheet_name]["data"].append(data)
 
+    def clear(self):
+        self._sheet_data.clear()
+
 
 @singleton
 class OptimizeResult:
@@ -110,12 +113,12 @@ class OptimizeResult:
     def add_tune_op_list(self, tune_op_list) -> None:
         """
         add tune op name to tune op list
-        :param tune_op_list: tune op name list to be added
+        :param tune_op_list: list of operators to be optimized
         :return: None
         """
-        for op_name in tune_op_list:
-            if op_name not in self._tune_op_list:
-                self._tune_op_list.append(op_name)
+        for operator_name in tune_op_list:
+            if operator_name not in self._tune_op_list:
+                self._tune_op_list.append(operator_name)
 
     def add(self, overview_item):
         sheet_name = "problems"
@@ -148,6 +151,9 @@ class OptimizeResult:
         logger.info("Save problems details file to %s", Config().analysis_result_file)
         self._save_op_file_list()
 
+    def clear(self) -> None:
+        self.data.clear()
+
     def _save_op_file_list(self) -> None:
         if not self._tune_op_list:
             return
@@ -173,9 +179,9 @@ class TerminalResult:
     def __init__(self):
         self.width, _ = self.get_terminal_size()
         if self.width is None:
-            self.table = PrettyTable(["No.", "Problem", "Description", "Suggestion"])
+            self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"])
         else:
-            self.table = PrettyTable(["No.", "Problem", "Description", "Suggestion"],
+            self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"],
                                      max_table_width=max(self.width - 20, 180))
         self.table.hrules = ALL
         self.result_list = []
diff --git a/profiler/advisor/rules/dataloader.yaml b/profiler/advisor/rules/dataloader.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2bb7a4c0e70a57b125dcdc6c793cedd50dd09228
--- /dev/null
+++ b/profiler/advisor/rules/dataloader.yaml
@@ -0,0 +1,9 @@
+# unit is milliseconds
+dataloader_duration_threshold: 10
+problem: "Found slow dataloader, which cost {dataloader_duration} milliseconds for one step while profiling; normally this takes less than {dataloader_duration_threshold} milliseconds."
+solutions:
+  - "Please check the disk I/O of your data directory. If you are training a model in ModelArts, please move the data to '/cache' or mount a more efficient cloud disk for better I/O."
+  - "Please check whether any other multiprocess operations at runtime affect the dataloader, such as the core binding command 'taskset ...' used to launch the training job."
+  - "Please check the format of your data and avoid archive formats such as tar, tar.gz and zip."
+  - "Please set 'pin_memory=True' for your dataloader."
+  - "Try to adjust the dataloader parameter 'num_workers'."
\ No newline at end of file
diff --git a/profiler/advisor/rules/sync_batchnorm.yaml b/profiler/advisor/rules/sync_batchnorm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d65bcb0d4a16c005e7d85979dc60ec1d05e19766
--- /dev/null
+++ b/profiler/advisor/rules/sync_batchnorm.yaml
@@ -0,0 +1,41 @@
+problem: "Found {syncbn_num} SyncBatchNorm, which can lead to slow Python task dispatch and frequent communication between devices, ultimately reducing training efficiency."
+max_syncbn_num: 20
+solutions:
+  - enable batchnorm:
+      desc: "disable SyncBatchNorm by removing code like 'torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)' if possible."
+  - enable efficient SyncBatchNorm:
+      desc: "replace the 'forward' method of the python script 'torch_npu/utils/syncbatchnorm.py' in your runtime environment."
+      efficient_code: |
+        @staticmethod
+        def forward(self, input_tensor, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size):
+            input_tensor = input_tensor.contiguous()
+            input_shape = input_tensor.shape
+            input_tensor_ = input_tensor.reshape(input_shape[0], input_shape[1], 1, -1)
+            sum_val, sum_square_val = torch.batch_norm_reduce(input_tensor_, eps)
+
+            count = torch.full((1,),
+                               input_tensor.numel() // input_tensor.size(1),
+                               dtype=sum_val.dtype,
+                               device=sum_val.device)
+
+            num_channels = input_tensor.shape[1]
+            combined = torch.cat([sum_val, sum_square_val, count], dim=0)
+            combined_list = torch.empty((world_size,) + combined.shape, dtype=combined.dtype, device=combined.device)
+            dist.all_gather_togather(combined_list, combined, process_group, async_op=False)
+            sum_all, square_sum_all, count_all = torch.split(combined_list, num_channels, dim=1)
+            size = count_all.view(-1).sum()
+            if size == 1:
+                raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
+
+            mean, invstd = torch.batch_norm_gather_stats_update(input_tensor,
+                                                                sum_all,
+                                                                square_sum_all,
+                                                                running_mean,
+                                                                running_var,
+                                                                momentum,
+                                                                eps,
+                                                                count_all.view(-1))
+            self.save_for_backward(input_tensor, weight, mean, invstd, count_all.to(torch.int32))
+            self.process_group = process_group
+            out = torch.batch_norm_elemt(input_tensor, weight, bias, mean, invstd, eps)
+            return out
\ No newline at end of file
diff --git a/profiler/advisor/rules/synchronize.yaml b/profiler/advisor/rules/synchronize.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ed105b345c624d3a8dba89e9eaa29ab8b0d89692
--- /dev/null
+++ b/profiler/advisor/rules/synchronize.yaml
@@ -0,0 +1,8 @@
+problem: "SynchronizeStream reduces training efficiency. Found {synchronize_num} SynchronizeStream calls; {slow_synchronize_num} slow SynchronizeStream calls cost {total_synchronize_stream_time} us in total."
+max_synchronize_num: 20
+slow_synchronize_threshold: 10 #ms
+solutions:
+  - disable ascend launch blocking:
+      desc: "please check the env 'ASCEND_LAUNCH_BLOCKING'; if ASCEND_LAUNCH_BLOCKING=1, please execute 'unset ASCEND_LAUNCH_BLOCKING' and then restart your training job."
+  - modify code to avoid synchronize stream:
+      desc: "please try to modify your training code to avoid synchronizing streams between the CPU and the NPU."
\ No newline at end of file
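To make the second suggestion in synchronize.yaml concrete, the sketch below shows a common source of SynchronizeStream events and one way to avoid it. This is a hedged, generic PyTorch example: the training-step function and its arguments are hypothetical and are not part of this patch.

```python
# Hypothetical sketch: reading a device tensor back to the host on every step
# (e.g. via loss.item()) forces a device-to-host copy plus a stream synchronize,
# which shows up as SynchronizeStream in the timeline.
def train_step(model, inputs, targets, loss_fn, optimizer, step, log_interval=100):
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Keep the loss on the device; only pay the synchronize cost when logging.
    if step % log_interval == 0:
        return loss.item()  # synchronizes the stream, but only every log_interval steps
    return None
```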
diff --git a/profiler/advisor/utils/utils.py b/profiler/advisor/utils/utils.py
index 84419b67087f8a434361f77479899d10ef91b9f5..83f304c2d3c7d2e583b9c3979a4cf2c232020f55 100644
--- a/profiler/advisor/utils/utils.py
+++ b/profiler/advisor/utils/utils.py
@@ -1,5 +1,6 @@
 import inspect
 import json
+
 import logging
 import multiprocessing as mp
 import os
@@ -11,7 +12,7 @@ import traceback
 import types
 from functools import wraps
 from typing import Any, Set
-
+import ijson
 import click
 import requests
 from requests.adapters import HTTPAdapter
@@ -43,7 +44,7 @@ class ContextObject(object):
 
 
 def debug_option(f):
-    return click.option('--debug', '-D',
+    return click.option('--debug',
                         is_flag=True,
                         expose_value=False,
                         is_eager=True,
@@ -413,7 +414,17 @@ def format_excel_title(title: str) -> str:
     title = title.replace("(ns)", '')
     title = title.replace("(%)", '')
     title = title.replace(" ", "_")
-    return title
+
+    # map column names from kernel_details.csv to be consistent with op_summary_x.csv
+    kernel_details_col_name_map = {
+        "name": "op_name",
+        "type": "op_type",
+        "accelerator_core": "task_type",
+        "start_time": "task_start_time",
+        "duration": "task_duration",
+        "wait_time": "wait_time"
+    }
+    return kernel_details_col_name_map.get(title, title)
 
 
 def format_float(num: float) -> float:
@@ -550,3 +561,50 @@ def get_file_path_by_walk(root, filename):
             file_path = os.path.join(root, name)
             return file_path
     return file_path
+
+
+def check_path_valid(path):
+    if os.path.islink(os.path.abspath(path)):
+        logger.error("The path is detected as a soft link. path: %s", path)
+        return False
+    elif not os.access(path, os.R_OK):
+        logger.error("The file is not readable. path: %s", path)
+        return False
+    elif os.path.getsize(path) > const.MAX_FILE_SIZE:
+        logger.error("The file size exceeds the limit. path: %s, MAX_FILE_SIZE: %s B", path, const.MAX_FILE_SIZE)
+        return False
+    return True
+
+
+def parse_json_with_generator(timeline_data_path, func):
+    result = []
+    if not check_path_valid(timeline_data_path):
+        return result
+    try:
+        with open(timeline_data_path, "r") as f:
+            if os.getenv(const.DISABLE_STREAMING_READER) == "1":
+                logger.debug("Disable streaming reader.")
+                file_parser = json.loads(f.read())
+            else:
+                logger.debug("Enable streaming reader.")
+                file_parser = ijson.items(f, "item")
+
+            for i, event in tqdm(enumerate(file_parser),
+                                 leave=False, ncols=100, desc="Building dataset for timeline analysis"):
+                func_res = func(index=i, event=event)
+                if func_res is not None:
+                    result.append(func_res)
+
+    except Exception:
+        logger.warning("Error %s while parsing file %s, continue to timeline analysis", traceback.format_exc(),
+                       timeline_data_path)
+    return result
+
+
+def convert_to_float(num):
+    try:
+        return float(num)
+    except (ValueError, FloatingPointError):
+        logger.error("Cannot convert %s to float", num)
+    return 0
diff --git a/profiler/cli/__init__.py b/profiler/cli/__init__.py
index eab13571c58756cc978ebc59479c86c0d1e85529..e768e4cb86c379f1618f1eb406914678eab47db8 100644
--- a/profiler/cli/__init__.py
+++ b/profiler/cli/__init__.py
@@ -1,4 +1,4 @@
 from profiler.advisor.config.config import Config
 from profiler.advisor.utils.utils import Timer
 
-Config().set_log_path(f"att_advisor_{Timer().strftime}.xlsx")
+Config().set_log_path(f"mstt_advisor_{Timer().strftime}.xlsx")
diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py
index 2e173dc87086a1335f49f9685b928708089aa1ea..f400a265b7bfcab1e5f19513a3eea43fea5250ce 100644
--- a/profiler/cli/analyze_cli.py
+++ b/profiler/cli/analyze_cli.py
@@ -83,9 +83,6 @@ def analyze_cli(**kwargs):
               help="enter the profiling type, selectable range ascend_pytorch_profiler, mslite ,msprof")
 @debug_option
 def analyze_all(**kwargs) -> None:
-    # 当前compare_tools必须输入两个profiling路径,att-advisor有等价功能支持输入一个Profiling路径,后续替换成对应实现
-    if not kwargs.get("benchmark_profiling_path"):
-        kwargs["benchmark_profiling_path"] = kwargs.get("profiling_path")
     try:
         _analyze(Interface.all_dimension, **kwargs)
     except RuntimeError as e:
diff --git a/profiler/cli/compare_cli.py b/profiler/cli/compare_cli.py
index e794578da8c37db4f825532a20c802f162bcb066..f9add948ea9da115ab785877a26b890329771f1b 100644
--- a/profiler/cli/compare_cli.py
+++ b/profiler/cli/compare_cli.py
@@ -32,6 +32,8 @@ from profiler.compare_tools.compare_backend.comparison_generator import ComparisonGenerator
 @click.option('--enable_operator_compare', is_flag=True)
 @click.option('--enable_memory_compare', is_flag=True)
 @click.option('--enable_communication_compare', is_flag=True)
+@click.option('--enable_api_compare', is_flag=True)
+@click.option('--enable_kernel_compare', is_flag=True)
 @click.option('--disable_details', is_flag=True)
 @click.option('--output_path', '-o', 'output_path', type=click.Path())
 @click.option('--max_kernel_num', 'max_kernel_num', type=int, help="The number of kernels per torch op is limited.")
diff --git a/profiler/cluster_analyse/README.md b/profiler/cluster_analyse/README.md
index fdd43ca965fe17edf9b565d05bf12cb68bff8d71..8ea2b7790d194051ea68b45e9548e4e2af40cdab 100644
--- a/profiler/cluster_analyse/README.md
+++ b/profiler/cluster_analyse/README.md
@@ -145,8 +145,8 @@ L列:Preparing,指迭代开始到首个计算或通信算子运行的时间
 **Tips**:可以根据rank互联的带宽以及链路类型,判断是否有慢链路的问题。
 
 - "LOCAL"是片内拷贝,速率非常快,不需要考虑。
-- “HCCS”或“PCIE”是节点内片间拷贝,速度在18GB左右或以上比较正常。
-- “RDMA”是节点间拷贝,910A速度在12GB左右或以上。
+-
“HCCS”或“PCIE”是节点内片间拷贝。 +- “RDMA”是节点间拷贝。 #### cluster_communication.json diff --git a/profiler/cluster_analyse/common_func/file_manager.py b/profiler/cluster_analyse/common_func/file_manager.py index e7e2d5adca37faf5b377bcbe720fdfba84311eca..380192f87befa1ce10e83502af34fe9a6b24ea67 100644 --- a/profiler/cluster_analyse/common_func/file_manager.py +++ b/profiler/cluster_analyse/common_func/file_manager.py @@ -17,6 +17,8 @@ import os import csv import json +import yaml + from common_func.constant import Constant from common_func.path_manager import PathManager @@ -60,6 +62,23 @@ class FileManager: raise RuntimeError(f"Failed to read the file: {base_name}") from e return result_data + @classmethod + def read_yaml_file(cls, file_path: str) -> dict: + PathManager.check_path_readable(file_path) + base_name = os.path.basename(file_path) + file_size = os.path.getsize(file_path) + if file_size <= 0: + return {} + if file_size > Constant.MAX_JSON_SIZE: + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") + + try: + with open(file_path, "r") as yaml_file: + result_data = yaml.safe_load(yaml_file) + except Exception as e: + raise RuntimeError(f"Failed to read the file: {base_name}") from e + return result_data + @classmethod def create_csv_file(cls, profiler_path: str, data: list, file_name: str, headers: list = None) -> None: if not data: diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md index 78ea5d8971722ec7d4f2b3ba624e66aa0bc33076..b40f19e92fa130e896b69c6f59889756c518d9ff 100644 --- a/profiler/compare_tools/README.md +++ b/profiler/compare_tools/README.md @@ -145,6 +145,8 @@ python performance_compare.py [基准性能数据文件所在路径] [比对性 | --enable_operator_compare | 开启算子性能比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | | --enable_communication_compare | 开启通信性能比对。 | 否 | | --enable_memory_compare | 开启算子内存比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | +| --enable_kernel_compare | 开启kernel性能比对。仅针对NPU与NPU比对的场景。需要使用性能数据中的kernel_details.csv文件。 | 否 | +| --enable_api_compare | 开启API性能比对。需要使用性能数据中的trace_view.csv文件。 | 否 | | --disable_details | 隐藏明细比对,只进行统计级比对。 | 否 | 说明:以上开关均不设置的情况下,**工具默认开启所有的性能比对**,当用户设置了以上开关,则按照用户设置的开关进行性能比对,示例如下: @@ -174,9 +176,13 @@ python performance_compare.py [基准性能数据文件] [比对性能数据文 MindSpore场景仅支持**总体性能**和**通信性能**的对比。 +比对结果分为打屏和performance_comparison_result_{timestamp}.csv两种形式输出,其中打屏输出为概要信息,csv文件保存详细结果。 + ### 总体性能 -总体性能比对结果以打屏的形式呈现。 +#### 打屏结果 + +总体性能比对结果以打屏的形式呈现时,字段如下: | 字段 | 说明 | | --------------------------------------- | ------------------------------------------------------------ | @@ -196,6 +202,54 @@ MindSpore场景仅支持**总体性能**和**通信性能**的对比。 | E2E Time(Not minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | | Other Time | AI CPU、DSA、TensorMove等其他算子耗时。 | +#### csv文件结果 + +总体性能比对结果在performance_comparison_result_*.xlsx中OverallMetrics的sheet页呈现时,示例如下: + +![OverallMetrics](./img/OverallMetrics.png) + +表头字段说明: + +| 字段 | 说明 | +| -------------- | --------------------------- | +| Index | 指标。 | +| Duration(ms) | 执行耗时,单位ms。 | +| Duration Ratio | 执行耗时占E2E总耗时的比例。 | +| Number | 计算算子的数量。 | + +Index列字段说明: + +| 字段 | | | 说明 | +| ---------------------------- | ------------------ | ----------------------------------- | ------------------------------------------------------------ | +| Computing Time | | | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | +| | Flash Attention | | Flash Attention算子。 | +| | | Flash Attention (Forward) (Cube) | Flash Attention前向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Flash Attention 
(Forward) (Vector) | Flash Attention前向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | | Flash Attention (Backward) (Cube) | Flash Attention反向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Flash Attention (Backward) (Vector) | Flash Attention反向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | Conv | | Conv算子。 | +| | | Conv (Forward) (Cube) | Conv前向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Conv (Forward) (Vector) | Conv前向Vector算子。Conv前向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | | Conv (Backward) (Cube) | Conv反向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Conv (Backward) (Vector) | Conv反向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | Matmul | | Matmul算子。 | +| | | Matmul (Cube) | Matmul算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Matmul (Vector) | Matmul算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | Paged Attention | | Paged Attention算子。 | +| | Vector | | Vector算子。 | +| | | Vector (Trans) | 转换类Vector算子,主要包含Cast、TransPose、TransData算子。(仅针对NPU数据) | +| | | Vector ( No Trans) | 非转换类Vector算子。 | +| | Cube | | 未识别出Flash Attention、Conv和Matmul的Cube算子。 | +| | SDMA (Tensor Move) | | 拷贝类任务。 | +| | Other | | AI CPU、DSA等其他算子。 | +| Uncovered Communication Time | | | 通信未掩盖耗时,包含卡间等待时间。 | +| | Wait | | 卡间同步等待耗时。(仅针对NPU数据) | +| | Transmit | | 通信传输耗时。 | +| Free Time | | | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | +| | SDMA | | NPU为除Tensor Move外的拷贝类任务,GPU为所有拷贝类任务。 | +| | Free | | 排除SDMA的空闲耗时。 | +| E2E Time | | | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | + 可以采取最简性能数据采集的方式来减少E2E耗时的性能膨胀,示例代码如下: ```python @@ -300,3 +354,29 @@ MindSpore场景暂不支持。 步骤1:查看MemoryCompareStatistic页,找出内存占用差距TOP的算子。 步骤2:查看MemoryCompare页,搜索内存占用差距TOP的算子,查看具体占用的子算子。 + +### kernel性能 + +仅针对NPU与NPU比对的场景。 + +kernel比对结果在performance_comparison_result_*.xlsx中KernelCompare页呈现。 + +按照Kernel(Kernel类型)和Input Shapes(输入Shape)分组统计,统计信息包括: + +- Total Duration(us):总耗时,单位us。 +- Avg Duration(us):平均耗时,单位us。 +- Max Duration(us):最大耗时,单位us。 +- Min Duration(us):最小耗时,单位us。 +- Calls:调用次数。 + +### API性能 + +API比对结果在performance_comparison_result_*.xlsx中ApiCompare页呈现。 + +按照api name(API名称)组统计,统计信息包括: + +- Total Duration(ms):总耗时,单位ms。 +- Self Time(ms):Self耗时(排除掉子event),单位ms。 +- Avg Duration(ms):平均耗时,单位ms。 +- Calls:调用次数。 + diff --git a/profiler/compare_tools/compare_backend/comparator/api_compare_comparator.py b/profiler/compare_tools/compare_backend/comparator/api_compare_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..bc5810068b04e04aa935ce252ffd127380dd855e --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/api_compare_comparator.py @@ -0,0 +1,32 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.constant import Constant +from compare_backend.utils.common_func import update_order_id + + +class ApiCompareComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + @classmethod + def _aggregated_api_by_name(cls, ops: list): + ops_dict = {} + for op in ops: + ops_dict.setdefault(op.name, []).append(op) + return ops_dict + + def _compare(self): + if not self._origin_data: + return + base_ops = self._origin_data.get(Constant.BASE_DATA, {}) + comparison_ops = self._origin_data.get(Constant.COMPARISON_DATA, {}) + if not base_ops or not comparison_ops: + return + base_aggregated_ops = self._aggregated_api_by_name(base_ops) + comparison_aggregated_ops = self._aggregated_api_by_name(comparison_ops) + 
for op_name, base_data in base_aggregated_ops.items(): + comparsion_data = comparison_aggregated_ops.pop(op_name, []) + self._rows.append(self._bean(op_name, base_data, comparsion_data).row) + if comparison_aggregated_ops: + for op_name, comparison_data in comparison_aggregated_ops.items(): + self._rows.append(self._bean(op_name, [], comparison_data).row) + update_order_id(self._rows) diff --git a/profiler/compare_tools/compare_backend/comparator/kernel_compare_comparator.py b/profiler/compare_tools/compare_backend/comparator/kernel_compare_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..13c0f776af60f7f250dc22b084cf251733f4c47d --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/kernel_compare_comparator.py @@ -0,0 +1,35 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.constant import Constant +from compare_backend.utils.common_func import update_order_id + + +class KernelCompareComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + @classmethod + def _aggregated_kernel_by_type_and_shape(cls, kernels: dict): + result_dict = {} + for type_shape, shape_values in kernels.items(): + for shape, kernel_data in shape_values.items(): + kernel = [single[1] for single in kernel_data] + result_list = [type_shape, shape, sum(kernel), len(kernel), max(kernel), min(kernel)] + result_dict.setdefault(f"{type_shape}{shape}", []).extend(result_list) + return result_dict + + def _compare(self): + if not self._origin_data: + return + base_kernels = self._origin_data.get(Constant.BASE_DATA, {}) + comparison_kernels = self._origin_data.get(Constant.COMPARISON_DATA, {}) + if not base_kernels or not comparison_kernels: + return + base_aggregated_kernels = self._aggregated_kernel_by_type_and_shape(base_kernels) + comparison_aggregated_kernels = self._aggregated_kernel_by_type_and_shape(comparison_kernels) + for type_shape, base_data in base_aggregated_kernels.items(): + comparsion_data = comparison_aggregated_kernels.pop(type_shape, []) + self._rows.append(self._bean(base_data, comparsion_data).row) + if comparison_aggregated_kernels: + for _, comparison_data in comparison_aggregated_kernels.items(): + self._rows.append(self._bean([], comparison_data).row) + update_order_id(self._rows) \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/compare_bean/api_compare_bean.py b/profiler/compare_tools/compare_backend/compare_bean/api_compare_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..55e08a86be86df1cfe5cba598336b9686af62386 --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/api_compare_bean.py @@ -0,0 +1,47 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + +class ApiInfo: + def __init__(self, op_name: str, data_list: list): + self._data_list = data_list + self.name = op_name + self.total_dur = 0.0 + self.self_time = 0.0 + self.avg_dur = 0.0 + self.number = len(data_list) + self._get_info() + + def _get_info(self): + for data in self._data_list: + self.total_dur += data.api_dur + self.self_time += data.api_self_time + self.total_dur /= 1000.0 + self.self_time /= 1000.0 + self.avg_dur = self.total_dur / self.number if self.number else 0.0 + + +class ApiCompareBean: + TABLE_NAME = Constant.API_TABLE + HEADERS = 
ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, op_name: str, base_api: list, comparison_api: list): + self._name = op_name + self._base_api = ApiInfo(op_name, base_api) + self._comparison_api = ApiInfo(op_name, comparison_api) + + @property + def row(self): + row = [None, self._name, + self._base_api.total_dur, self._base_api.self_time, self._base_api.avg_dur, self._base_api.number, + self._comparison_api.total_dur, self._comparison_api.self_time, + self._comparison_api.avg_dur, self._comparison_api.number] + diff_fields = [calculate_diff_ratio(self._base_api.total_dur, self._comparison_api.total_dur)[1], + calculate_diff_ratio(self._base_api.self_time, self._comparison_api.self_time)[1], + calculate_diff_ratio(self._base_api.avg_dur, self._comparison_api.avg_dur)[1], + calculate_diff_ratio(self._base_api.number, self._comparison_api.number)[1]] + row.extend(diff_fields) + return row + diff --git a/profiler/compare_tools/compare_backend/compare_bean/kernel_compare_bean.py b/profiler/compare_tools/compare_backend/compare_bean/kernel_compare_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..df96addc4fe8f90123d02ddb70911a8e10b23eac --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/kernel_compare_bean.py @@ -0,0 +1,75 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + +class KernelCompareInfo: + def __init__(self, data_list: list): + self._kernel_type = None + self._input_shapes = None + self._total_dur = None + self._number = None + self._max_dur = None + self._min_dur = None + if not data_list: + return + self._kernel_type = data_list[0] + self._input_shapes = data_list[1] + self._total_dur = data_list[2] + self._number = data_list[3] + self._max_dur = data_list[4] + self._min_dur = data_list[5] + + @property + def kernel_type(self): + return self._kernel_type + + @property + def input_shapes(self): + return self._input_shapes + + @property + def total_dur(self): + return self._total_dur if self._total_dur else 0.0 + + @property + def number(self): + return self._number + + @property + def max_dur(self): + return self._max_dur + + @property + def min_dur(self): + return self._min_dur + + @property + def avg_dur(self): + return self._total_dur / self._number if self._total_dur and self._number else 0.0 + + +class KernelCompareBean: + TABLE_NAME = Constant.KERNEL_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, base_kernel: list, comparison_kernel: list): + self._base_kernel = KernelCompareInfo(base_kernel) + self._comparison_kernel = KernelCompareInfo(comparison_kernel) + self._kernel_type = self._base_kernel.kernel_type \ + if self._base_kernel.kernel_type else self._comparison_kernel.kernel_type + self._input_shapes = self._base_kernel.input_shapes \ + if self._base_kernel.input_shapes else self._comparison_kernel.input_shapes + + @property + def row(self): + row = [None, self._kernel_type, self._input_shapes, + self._base_kernel.total_dur, self._base_kernel.avg_dur, + self._base_kernel.max_dur, self._base_kernel.min_dur, self._base_kernel.number, + self._comparison_kernel.total_dur, self._comparison_kernel.avg_dur, + self._comparison_kernel.max_dur, self._comparison_kernel.min_dur, self._comparison_kernel.number] + diff_fields = 
[calculate_diff_ratio(self._base_kernel.total_dur, self._comparison_kernel.total_dur)[1], + calculate_diff_ratio(self._base_kernel.avg_dur, self._comparison_kernel.avg_dur)[1]] + row.extend(diff_fields) + return row \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py index 9c4825c0e8e0503127e6c450042cf784e73d9974..c15396e9c597b67089acc1afd11c9f351e47b379 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py @@ -12,6 +12,7 @@ class KernelDetailsBean: self._data = data self._op_type = "" self._name = "" + self._input_shapes = "" self._aiv_vec_time = 0.0 self._aicore_time = 0.0 self._mac_time = 0.0 @@ -27,6 +28,10 @@ class KernelDetailsBean: def name(self) -> str: return self._name + @property + def input_shapes(self) -> str: + return self._input_shapes + @property def aiv_vec_time(self) -> float: if self._aiv_vec_time == "" or self._aiv_vec_time == "N/A": @@ -109,6 +114,7 @@ class KernelDetailsBean: def init(self): self._op_type = self._data.get('Type', "") self._name = self._data.get('Name', "") + self._input_shapes = self._data.get('Input Shapes', "") self._aiv_vec_time = self._data.get('aiv_vec_time(us)', "") self._aicore_time = self._data.get("aicore_time(us)", "") self._mac_time = self._data.get('mac_time(us)', "") diff --git a/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py index fdce23c6ab4ff7f9f6f7d6bc1442063c57cb6098..3106527c41997287c0457d0e2f555537c79e9a50 100644 --- a/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py +++ b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py @@ -17,3 +17,20 @@ class OperatorDataPrepare: else: result_data.append(level1_node) return result_data + + def get_all_layer_ops(self) -> any: + root_node = TreeBuilder.build_tree(self.profiling_data.torch_op_data, [], []) + level1_child_nodes = root_node.child_nodes + node_queue = [] + result_data = [] + for level1_node in level1_child_nodes: + if level1_node.is_step_profiler(): + node_queue.extend(level1_node.child_nodes) + else: + node_queue.append(level1_node) + while len(node_queue) > 0: + node = node_queue.pop(0) + result_data.append(node) + if node.child_nodes: + node_queue.extend(node.child_nodes) + return result_data \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py b/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py index c89e84519302781a590523bc7fdaaf9e1254acf5..7bac2b0335329a2ef9b5e13e21feeedcf569246d 100644 --- a/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py +++ b/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py @@ -31,4 +31,30 @@ class OverallPerfInterface: def _generate_result(self): overall_data = self._profiling_data.overall_metrics - self._result_data = getattr(overall_data, "__dict__", {}) + + self._result_data = { + "profiling_type": overall_data.profiling_type, + "minimal_profiling": overall_data.minimal_profiling, + "overall": {"e2e_time_ms": overall_data.e2e_time_ms, + "computing_time_ms": overall_data.compute_time_ms, + "uncovered_communication_time_ms": 
overall_data.communication_not_overlapped_ms, + "free_time_ms": overall_data.free_time_ms}, + "computing_time_disaggregate": {"fa_time_ms": overall_data.fa_total_time, + "conv_time_ms": overall_data.conv_total_time, + "matmul_time_ms": overall_data.mm_total_time, + "page_attention_time_ms": overall_data.page_attention_time, + "vector_time_ms": overall_data.vector_total_time, + "tensor_move_time_ms": overall_data.sdma_time_tensor_move, + "other_cube_time_ms": overall_data.other_cube_time}, + "computing_num_disaggregate": {"fa_num": overall_data.fa_total_num, + "conv_num": overall_data.conv_total_num, + "matmul_num": overall_data.mm_total_num, + "page_attention_num": overall_data.page_attention_num, + "vector_num": overall_data.vector_total_num, + "tensor_move_num": overall_data.sdma_num_tensor_move, + "other_cube_num": overall_data.other_cube_num}, + "communication_time_disaggregate": {"wait_time_ms": overall_data.wait_time_ms, + "transmit_time_ms": overall_data.transmit_time_ms}, + "free_time_disaggregate": {"sdma_time_ms": overall_data.sdma_time_stream, + "free_ms": overall_data.free_time_ms - overall_data.sdma_time_stream} + } diff --git a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 292e312815452c78fbc71fcca9860f887b38d8d4..6fe693fb0675cc91820f859c476c61999054ec25 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -8,6 +8,8 @@ from compare_backend.comparator.module_comparetor import ModuleComparator from compare_backend.comparator.module_statistic_comparator import ModuleStatisticComparator from compare_backend.comparator.operator_comparator import OperatorComparator from compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator +from compare_backend.comparator.api_compare_comparator import ApiCompareComparator +from compare_backend.comparator.kernel_compare_comparator import KernelCompareComparator from compare_backend.comparator.overall_metrics_comparator import OverallMetricsComparator from compare_backend.compare_bean.communication_bean import CommunicationBean from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean @@ -16,6 +18,8 @@ from compare_backend.compare_bean.module_compare_bean import ModuleCompareBean from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean +from compare_backend.compare_bean.api_compare_bean import ApiCompareBean +from compare_backend.compare_bean.kernel_compare_bean import KernelCompareBean from compare_backend.compare_bean.overall_metrics_bean import OverallMetricsBean from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare @@ -39,8 +43,10 @@ class DetailPerformanceGenerator(BaseGenerator): return op_compare_result def compare(self): - if self._args.enable_operator_compare or self._args.enable_memory_compare or \ - self._args.enable_communication_compare: + enable_compare = [self._args.enable_operator_compare, self._args.enable_memory_compare, + self._args.enable_communication_compare, self._args.enable_api_compare, + 
self._args.enable_kernel_compare] + if any(enable_compare): print("[INFO] Start to compare performance detail data, please wait.") comparator_list = self._create_comparator() else: @@ -97,6 +103,18 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) if not self._args.disable_details: comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) + if self._args.enable_api_compare: + api_compare_result = { + Constant.BASE_DATA: OperatorDataPrepare( + self._profiling_data_dict.get(Constant.BASE_DATA)).get_all_layer_ops(), + Constant.COMPARISON_DATA: OperatorDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).get_all_layer_ops()} + comparator_list.append(ApiCompareComparator(api_compare_result, ApiCompareBean)) + if self._args.enable_kernel_compare: + kernel_compare_result = { + Constant.BASE_DATA: self._profiling_data_dict.get(Constant.BASE_DATA).kernel_details, + Constant.COMPARISON_DATA: self._profiling_data_dict.get(Constant.COMPARISON_DATA).kernel_details} + comparator_list.append(KernelCompareComparator(kernel_compare_result, KernelCompareBean)) return comparator_list def match_torch_op(self) -> list: diff --git a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py index 6ee07a65696e4c482b80238a66b0564f2c29e8f0..9daaa55ef163b157d4f200cbe039a562a865d72f 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py @@ -20,6 +20,7 @@ class ProfilingResult: self.overall_metrics = ProfilingInfo(profiling_type) self.python_function_data = [] self.fwdbwd_dict = {} + self.kernel_details = {} def update_torch_op_data(self, event: TraceEventBean): event.is_torch_op = True @@ -43,6 +44,9 @@ class ProfilingResult: def update_comm_task_data(self, comm_name: str, task_event: TraceEventBean): self.communication_dict.setdefault(comm_name, {}).setdefault("comm_task", {}).setdefault( task_event.name, []).append(task_event.dur) + + def update_kernel_details(self, kernels: dict): + self.kernel_details = kernels class BaseProfilingParser(ABC): @@ -57,6 +61,8 @@ class BaseProfilingParser(ABC): self._enable_operator_compare = args.enable_operator_compare self._enable_memory_compare = args.enable_memory_compare self._enable_communication_compare = args.enable_communication_compare + self._enable_api_compare = args.enable_api_compare + self._enable_kernel_compare = args.enable_kernel_compare self._dispatch_func = self._get_dispatch_func() self._result_data = ProfilingResult(self._profiling_type) self._memory_events = [] @@ -80,6 +86,10 @@ class BaseProfilingParser(ABC): self._cpu_cube_op = cpu_cube_op return self._cpu_cube_op + @abstractmethod + def _update_kernel_details(self): + raise NotImplementedError("Function _update_kernel_details need to be implemented.") + @abstractmethod def _update_memory_list(self): raise NotImplementedError("Function _update_memory_list need to be implemented.") @@ -112,6 +122,8 @@ class BaseProfilingParser(ABC): self._update_memory_list() if self._enable_profiling_compare: self._update_overall_metrics() + if self._enable_kernel_compare: + self._update_kernel_details() self._check_result_data() return self._result_data @@ -291,7 +303,7 @@ class BaseProfilingParser(ABC): task_index += 1 def _check_result_data(self): - if 
self._enable_operator_compare or self._enable_memory_compare:
+        if self._enable_operator_compare or self._enable_memory_compare or self._enable_api_compare:
             if not self._result_data.torch_op_data:
                 print(f"[WARNING] Can't find any torch op in the file: {self._profiling_path}")
         if self._enable_operator_compare and not self._result_data.kernel_dict:
@@ -300,6 +312,11 @@
             print(f"[WARNING] Can't find any memory event in the file: {self._profiling_path}")
         if self._enable_communication_compare and not self._result_data.communication_dict:
             print(f"[WARNING] Can't find any communication op in the file: {self._profiling_path}")
+        if self._enable_kernel_compare and not self._result_data.kernel_details:
+            if self._profiling_type == Constant.GPU:
+                print("[WARNING] Kernel compare between GPU data and NPU data is not supported.")
+            else:
+                print(f"[WARNING] Can't find any kernel details in the file: {self._profiling_path}")
 
     def _read_trace_event(self):
         try:
diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
index 7b1ae1a5a12ac1547123f5822e63069d719a18a6..0aeeba83efb1ec62b0cf53ced7084dcccb7aa6c8 100644
--- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
+++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
@@ -33,6 +33,9 @@ class GPUProfilingParser(BaseProfilingParser):
     def __is_sdma_time(cls, name: str):
         return any(mask in name.lower() for mask in cls.SDMA_MARK_LIST)
 
+    def _update_kernel_details(self):
+        pass
+
     def _update_memory_list(self):
         if not self._enable_memory_compare:
             return
@@ -171,6 +174,8 @@ class GPUProfilingParser(BaseProfilingParser):
             func_set.add(self._picking_memory_event)
         if self._enable_profiling_compare:
             func_set.add(self._picking_flow_event)
+        if self._enable_api_compare:
+            func_set.add(self._picking_torch_op_event)
         return list(func_set)
 
     def _infer_compute_stream_id(self):
diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
index 457a3b6be5e6a93a9a5e2a78d028096895b6ba56..cb25c252c6c825cb22fea63a4c1ecc82f9c61e57 100644
--- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
+++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
@@ -53,8 +53,32 @@ class NPUProfilingParser(BaseProfilingParser):
             func_list.add(self._picking_kernel_event)
             func_list.add(self._picking_hccl_event)
             func_list.add(self._picking_flow_event)
+        if self._enable_api_compare:
+            func_list.add(self._picking_torch_op_event)
         return list(func_list)
 
+    def _update_kernel_details(self):
+        try:
+            kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean)
+        except FileNotFoundError:
+            print("[WARNING] The file kernel_details.csv does not exist.")
+            return
+        except Exception:
+            print("[ERROR] Failed to read kernel_details.csv.")
+            return
+        if not kernel_details:
+            return
+        kernels_dict = {}
+        for kernel in kernel_details:
+            if kernel.is_invalid():
+                continue
+            input_shapes = kernel.input_shapes if kernel.input_shapes else 'N/A'
+            kernels_dict.setdefault(kernel.op_type, {}).setdefault(input_shapes, []).append(
+                [kernel.name, kernel.duration])
+        if len(kernels_dict) == 1:
+            print("[ERROR] Failed to enable kernel_compare: the kernel type in kernel_details.csv is null.")
+            return
+
self._result_data.update_kernel_details(kernels_dict) + def _update_memory_list(self): try: memory_data = FileReader.read_csv_file(self._operator_memory_path, OperatorMemoryBean) diff --git a/profiler/compare_tools/compare_backend/utils/args_manager.py b/profiler/compare_tools/compare_backend/utils/args_manager.py index 4b5947fa7bccc32277bb9d18d97ab71249c66941..ab9fb43a9681a5c5a15d280f1052f493ae8dfcde 100644 --- a/profiler/compare_tools/compare_backend/utils/args_manager.py +++ b/profiler/compare_tools/compare_backend/utils/args_manager.py @@ -69,6 +69,14 @@ class ArgsManager: def enable_communication_compare(self): return self._args.enable_communication_compare + @property + def enable_api_compare(self): + return self._args.enable_api_compare + + @property + def enable_kernel_compare(self): + return self._args.enable_kernel_compare + @classmethod def check_profiling_path(cls, file_path: str): PathManager.input_path_common_check(file_path) @@ -119,11 +127,14 @@ class ArgsManager: raise RuntimeError(msg) if not any([self._args.enable_profiling_compare, self._args.enable_operator_compare, - self._args.enable_memory_compare, self._args.enable_communication_compare]): + self._args.enable_memory_compare, self._args.enable_communication_compare, + self._args.enable_api_compare, self._args.enable_kernel_compare]): self._args.enable_profiling_compare = True self._args.enable_operator_compare = True self._args.enable_memory_compare = True self._args.enable_communication_compare = True + self._args.enable_api_compare = True + self._args.enable_kernel_compare = True base_profiling_path = PathManager.get_realpath(self._args.base_profiling_path) self.check_profiling_path(base_profiling_path) diff --git a/profiler/compare_tools/compare_backend/utils/compare_args.py b/profiler/compare_tools/compare_backend/utils/compare_args.py index ab9bc364f440ca8412a6e40d67ca74b7c897cbd9..9e6291e89e0d8273073d1a2f4d8ec06a2dad79c1 100644 --- a/profiler/compare_tools/compare_backend/utils/compare_args.py +++ b/profiler/compare_tools/compare_backend/utils/compare_args.py @@ -6,6 +6,8 @@ class Args: enable_operator_compare: bool = False, enable_memory_compare: bool = False, enable_communication_compare: bool = False, + enable_api_compare: bool = False, + enable_kernel_compare: bool = False, output_path: str = "", max_kernel_num: int = None, op_name_map: dict = {}, @@ -17,6 +19,8 @@ class Args: self.enable_operator_compare = enable_operator_compare self.enable_memory_compare = enable_memory_compare self.enable_communication_compare = enable_communication_compare + self.enable_api_compare = enable_api_compare + self.enable_kernel_compare = enable_kernel_compare self.output_path = output_path self.max_kernel_num = max_kernel_num self.op_name_map = op_name_map diff --git a/profiler/compare_tools/compare_backend/utils/constant.py b/profiler/compare_tools/compare_backend/utils/constant.py index e2002588024fa2a701874ffa381590b0830d2fab..252aa536e1c73d58f86071bbefab5286004bb6f9 100644 --- a/profiler/compare_tools/compare_backend/utils/constant.py +++ b/profiler/compare_tools/compare_backend/utils/constant.py @@ -39,13 +39,16 @@ class Constant(object): # compare type OPERATOR_COMPARE = "OperatorCompare" MEMORY_COMPARE = "MemoryCompare" - + API_COMPARE = "ApiCompare" + KERNEL_COMPARE = "KernelCompare" # sheet name OPERATOR_SHEET = "OperatorCompare" MEMORY_SHEET = "MemoryCompare" OPERATOR_TOP_SHEET = "OperatorCompareStatistic" MEMORY_TOP_SHEET = "MemoryCompareStatistic" COMMUNICATION_SHEET = "CommunicationCompare" + API_SHEET = 
"ApiCompare" + KERNEL_SHEET = "KernelCompare" # table name OPERATOR_TABLE = "OperatorCompare" @@ -57,6 +60,8 @@ class Constant(object): MODULE_TABLE = "ModuleCompare" MODULE_TOP_TABLE = "ModuleCompareStatistic" OVERALL_METRICS_TABLE = "OverallMetrics" + API_TABLE = "ApiCompare" + KERNEL_TABLE = "KernelCompare" # memory SIZE = "Size(KB)" diff --git a/profiler/compare_tools/compare_backend/utils/excel_config.py b/profiler/compare_tools/compare_backend/utils/excel_config.py index ae808863e77118358800c0fce5de2a3b763ec5e4..b6be0ae2ebcf8ea51ab0ab13081bff4d24d7d69a 100644 --- a/profiler/compare_tools/compare_backend/utils/excel_config.py +++ b/profiler/compare_tools/compare_backend/utils/excel_config.py @@ -57,7 +57,7 @@ class ExcelConfig(object): DEVICE_SELF_TIME = "Device Self Time(ms)" DEVICE_TOTAL_TIME = "Device Total Time(ms)" DIFF_SELF_TIME = "Device Self Time Diff(ms)" - DIFF_TOTAL_RATIO = "Total Diff Ratio" + DIFF_TOTAL_RATIO = "Diff Total Ratio" DIFF_TOTAL_TIME = "Device Total Time Diff(ms)" DEVICE_SELF_TIME_US = "Device Self Time(us)" DEVICE_TOTAL_TIME_US = "Device Total Time(us)" @@ -71,6 +71,14 @@ class ExcelConfig(object): DURATION = "Duration(ms)" DURATION_RATIO = "Duration Ratio" DIFF_DUR_MS = "Diff Duration(ms)" + API_NAME = "api name" + TOTAL_DURATION_MS = "Total Duration(ms)" + AVG_DURATION_MS = "Avg Duration(ms)" + SELF_TIME_MS = "Self Time(ms)" + DIFF_SELF_RATIO = "Diff Self Ratio" + DIFF_AVG_RATIO = "Diff Avg Ratio" + DIFF_CALLS_RATIO = "Diff Calls Ratio" + KERNEL = "Kernel" HEADERS = { Constant.OPERATOR_TABLE: [ @@ -193,7 +201,39 @@ class ExcelConfig(object): {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, {"name": DIFF_DUR_MS, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 10}, - + ], + Constant.API_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": API_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": TOTAL_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": SELF_TIME_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": TOTAL_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": SELF_TIME_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_SELF_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_AVG_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_CALLS_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + ], + Constant.KERNEL_COMPARE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": KERNEL, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": TOTAL_DURATION, "type": 
CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_AVG_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, ] } @@ -201,7 +241,9 @@ class ExcelConfig(object): Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"], Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"], Constant.MODULE_TABLE: ["E1:H1", "I1:L1"], - Constant.OVERALL_METRICS_TABLE: ["B1:D1", "E1:G1"]} + Constant.OVERALL_METRICS_TABLE: ["B1:D1", "E1:G1"], + Constant.API_TABLE: ["C1:F1", "G1:J1"], + Constant.KERNEL_TABLE: ["D1:H1", "I1:M1"]} # overall metrics index # computing time diff --git a/profiler/compare_tools/compare_backend/utils/torch_op_node.py b/profiler/compare_tools/compare_backend/utils/torch_op_node.py index 690c46cd51c1e2991b0bfaf44e9af431cdad5151..69ee92d1232e5808ed428896a03718230559d12f 100644 --- a/profiler/compare_tools/compare_backend/utils/torch_op_node.py +++ b/profiler/compare_tools/compare_backend/utils/torch_op_node.py @@ -64,6 +64,14 @@ class TorchOpNode: def device_dur(self): return sum([kernel.device_dur for kernel in self._kernel_list]) + @property + def api_dur(self): + return self._event.dur + + @property + def api_self_time(self): + return self.api_dur - sum(child.api_dur for child in self._child_nodes) + def add_child_node(self, child_node): self._child_nodes.append(child_node) diff --git a/profiler/compare_tools/compare_backend/utils/tree_builder.py b/profiler/compare_tools/compare_backend/utils/tree_builder.py index 34c1fe1a1f4046d1e60af107f5ee74484424174a..d5aa787ac2cff1ba4a714b8522b839b0dc83bfd2 100644 --- a/profiler/compare_tools/compare_backend/utils/tree_builder.py +++ b/profiler/compare_tools/compare_backend/utils/tree_builder.py @@ -23,7 +23,8 @@ class TreeBuilder: tree_node = TorchOpNode(event, last_node) last_node.add_child_node(tree_node) last_node = tree_node - tree_node.set_kernel_list(kernel_dict.get(event.start_time, [])) + if kernel_dict: + tree_node.set_kernel_list(kernel_dict.get(event.start_time, [])) else: event.set_name(last_node.name) last_node.set_memory_allocated(event) diff --git a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py index dffb7549fcd92af6b14ee81b019eba86996d9369..58bad621b03f855933517ef9286047e23b5681ea 100644 --- a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py +++ b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py @@ -12,7 +12,7 @@ class WorkSheetCreator: self._work_sheet = None self._row_id = 1 self._field_format = {} - self._diff_ratio_index = None + self._diff_ratio_index = [] self._col_ids = "ABCDEFGHIJKLMNOPQRSTUVW" def create_sheet(self): @@ -47,8 +47,10 @@ class WorkSheetCreator: self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width")) self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format) self._field_format[index] = header.get("type") - if header.get("name") in (ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO): - self._diff_ratio_index = index + ratio_white_list = [ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO, + 
ExcelConfig.DIFF_AVG_RATIO, ExcelConfig.DIFF_CALLS_RATIO, ExcelConfig.DIFF_SELF_RATIO] + if header.get("name") in ratio_white_list: + self._diff_ratio_index.append(index) self._row_id += 1 def _write_data(self): @@ -56,7 +58,7 @@ class WorkSheetCreator: for data in self._data.get("rows"): for index, cell_data in enumerate(data): cell_format = self._work_book.add_format(self._field_format.get(index)) - if index == self._diff_ratio_index and cell_data and cell_data > 1: + if index in self._diff_ratio_index and cell_data and cell_data > 1: cell_format = red_ratio_format cell_data = "INF" if cell_data == float('inf') else cell_data self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) @@ -76,7 +78,7 @@ class WorkSheetCreator: if index == 0: # 0 for Index field cell_style["indent"] = cell_data.count("\t") cell_format = self._work_book.add_format(cell_style) - if index == self._diff_ratio_index and cell_data and cell_data > 1: + if index in self._diff_ratio_index and cell_data and cell_data > 1: cell_format = red_ratio_format cell_data = "INF" if cell_data == float('inf') else cell_data self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) diff --git a/profiler/compare_tools/img/OverallMetrics.png b/profiler/compare_tools/img/OverallMetrics.png new file mode 100644 index 0000000000000000000000000000000000000000..b130d3607344c983a9304440e38a45fe96a4bb56 Binary files /dev/null and b/profiler/compare_tools/img/OverallMetrics.png differ diff --git a/profiler/compare_tools/performance_compare.py b/profiler/compare_tools/performance_compare.py index 8de0a72cbdcfe958f465d338ded44bd0077c4bc0..7c9d60aac0af38c3fe3dd5f2ca9c96380438d4c9 100644 --- a/profiler/compare_tools/performance_compare.py +++ b/profiler/compare_tools/performance_compare.py @@ -18,6 +18,8 @@ def main(): parser.add_argument("--enable_operator_compare", default=False, action='store_true', help="enable operator performance comparison") parser.add_argument("--enable_memory_compare", default=False, action='store_true', help="enable operator memory comparison") parser.add_argument("--enable_communication_compare", default=False, action='store_true', help="enable communication performance comparison") + parser.add_argument("--enable_api_compare", default=False, action='store_true', help="enable host API performance comparison") + parser.add_argument("--enable_kernel_compare", default=False, action='store_true', help="enable kernel performance comparison") parser.add_argument("--disable_details", default=False, action='store_true', help="do not show comparison details") parser.add_argument("--output_path", type=str, default='', help="output path for the comparison results") parser.add_argument("--max_kernel_num", type=int, help="max number of kernels per torch op") diff --git a/profiler/module_visualization/graph/prof_node.py b/profiler/module_visualization/graph/prof_node.py index cfcdabbb991d2abb86f31e5a5866e788cf9a3c6e..df77d325df8fe6d6a8771dda500cc6a1ddcda1c8 100644 --- a/profiler/module_visualization/graph/prof_node.py +++ b/profiler/module_visualization/graph/prof_node.py @@ -18,38 +18,58 @@ from profiler.prof_common.trace_event_bean import TraceEventBean class ProfNode(BaseNode): - MODULE_TYPE = 1 def __init__(self, event: TraceEventBean, parent_node=None): super().__init__(event, parent_node) self._kernel_total_list = [] + self._communication_total_list = [] + self._precision_index = 1 + self._computing_time = 0 + self._uncovered_comm_time = 0 + self._free_time = 0 @property def node_id(self): return self._event.unique_id + @property + def node_type(self): + if self._event.event_type is None: + return Constant.VIRTUAL_TYPE + return self._event.event_type + 
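Reviewer note: node_type falls back to VIRTUAL_TYPE whenever the underlying bean was never tagged with an event_type; only events routed through the pre-processing further below (nn.Module and optimizer events get MODULE_TYPE, aten ops get OPERATOR_TYPE) carry a tag. A minimal sketch under that assumption (bean names are hypothetical):

    module_bean = TraceEventBean({"name": "nn.Module: Linear_0"})
    module_bean.event_type = Constant.MODULE_TYPE  # tagged during pre-processing
    virtual_bean = TraceEventBean({}, "Operators_Between_Modules_1")  # never tagged
    assert ProfNode(module_bean).node_type == Constant.MODULE_TYPE
    assert ProfNode(virtual_bean).node_type == Constant.VIRTUAL_TYPE  # event_type is None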
@property def total_kernels(self): + if self.node_type == Constant.VIRTUAL_TYPE: + return [kernel for node in self.child_nodes for kernel in node.total_kernels] return self._kernel_total_list + @property + def total_communications(self): + if self.node_type == Constant.VIRTUAL_TYPE: + return [comm for node in self.child_nodes for comm in node.total_communications] + return self._communication_total_list + @property def host_total_dur(self): - if self.is_root_node: + if self.node_type == Constant.VIRTUAL_TYPE: return sum((node.host_total_dur for node in self.child_nodes)) return self._event.dur @property def host_self_dur(self): + if self.node_type == Constant.VIRTUAL_TYPE: + return 0 return self.host_total_dur - sum((node.host_total_dur for node in self.child_nodes)) @property def device_total_dur(self): - if self.is_root_node: - return sum((node.device_total_dur for node in self.child_nodes)) - return sum((kernel.dur for kernel in self._kernel_total_list)) + return sum((kernel.dur for kernel in self.total_kernels)) @property def device_self_dur(self): + if self.node_type == Constant.VIRTUAL_TYPE: + return 0 return self.device_total_dur - sum((node.device_total_dur for node in self.child_nodes)) @property @@ -63,18 +83,40 @@ class ProfNode(BaseNode): data["Input type"] = input_type return data + @property + def kernel_data(self) -> list: + if self.node_type == Constant.VIRTUAL_TYPE: + return [kernel for node in self.child_nodes for kernel in node.kernel_data] + return [kernel.kernel_info for kernel in self.total_kernels] + + @property + def communication_data(self) -> list: + return [[comm.name, comm.dur] for comm in self.total_communications] + + @property + def overall_data(self): + return {"Computing Time(us)": round(self._computing_time, 3), + "Uncovered Communication Time(us)": round(self._uncovered_comm_time, 3), + "Free Time(us)": round(self._free_time, 3)} + @property def data(self): - return {"Input Data": self.input_data, - "Host Self Duration(us)": round(self.host_self_dur, 2), - "Host Total Duration(us)": round(self.host_total_dur, 2), - "Device Self Duration(us)": round(self.device_self_dur, 2), - "Device Total Duration(us)": round(self.device_total_dur, 2)} + data = { + "Overall Metrics": self.overall_data} if self.node_type != Constant.OPERATOR_TYPE else {} + data.update({"Input Data": self.input_data, + "precision_index": self.precision_index, + "Host Self Duration(us)": round(self.host_self_dur, 3), + "Host Total Duration(us)": round(self.host_total_dur, 3), + "Device Self Duration(us)": round(self.device_self_dur, 3), + "Device Total Duration(us)": round(self.device_total_dur, 3), + "kernels": self.kernel_data, + "Communications": self.communication_data}) + return data @property def info(self): return {"id": self.node_id, - "node_type": self.MODULE_TYPE, + "node_type": self.node_type, "data": self.data, "upnode": self.parent_node.node_id if self.parent_node else "None", "subnodes": [node.node_id for node in iter(self.child_nodes)]} @@ -83,8 +125,56 @@ class ProfNode(BaseNode): def is_root_node(self): return self.node_id == Constant.NPU_ROOT_ID + @property + def precision_index(self): + return self._precision_index + + @precision_index.setter + def precision_index(self, precision_index): + self._precision_index = precision_index + def update_child_nodes(self, node): self._child_nodes.append(node) + def reset_child_nodes(self, nodes): + self._child_nodes = nodes + def update_kernel_total_list(self, kernel_list: list): self._kernel_total_list.extend(kernel_list) + + def 
update_communication_total_list(self, communication_list: list): + self._communication_total_list.extend(communication_list) + + def update_child_precision_index(self): + if not self.child_nodes: + return + max_dur = max((node.device_total_dur for node in self.child_nodes)) + min_dur = min((node.device_total_dur for node in self.child_nodes)) + diff_dur = max_dur - min_dur + for node in self.child_nodes: + node.precision_index = 1 - (node.device_total_dur - min_dur) / diff_dur if diff_dur else 1 + + def update_overall_metrics(self, overlap_analysis_event): + if not self.total_kernels and not self.total_communications: + return + kernel_start = min((kernel.start_time for kernel in self.total_kernels)) if self.total_kernels else float("inf") + kernel_end = max((kernel.end_time for kernel in self.total_kernels)) if self.total_kernels else float("-inf") + comm_start = min((comm.start_time for comm in self.total_communications)) \ + if self.total_communications else float("inf") + comm_end = max((comm.end_time for comm in self.total_communications)) \ + if self.total_communications else float("-inf") + device_start = min(kernel_start, comm_start) + device_end = max(kernel_end, comm_end) + for event in overlap_analysis_event: + if event.start_time >= device_end: + continue + if event.end_time <= device_start: + continue + duration_us = float( + min(device_end, event.end_time) - max(device_start, event.start_time)) + if event.name == Constant.COMPUTING_EVENT: + self._computing_time += duration_us + elif event.name == Constant.FREE_EVENT: + self._free_time += duration_us + elif event.name == Constant.UNCOVERED_COMMUNICATION_EVENT: + self._uncovered_comm_time += duration_us diff --git a/profiler/module_visualization/graph_build/prof_graph_builder.py b/profiler/module_visualization/graph_build/prof_graph_builder.py index 83331b6250211e32399b05cabf19a293759a3741..9606193acd8de5d6b288ae14b4b7dd89307dfe7e 100644 --- a/profiler/module_visualization/graph_build/prof_graph_builder.py +++ b/profiler/module_visualization/graph_build/prof_graph_builder.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from decimal import Decimal + from profiler.module_visualization.graph.prof_node import ProfNode from profiler.module_visualization.graph_build.fwd_module_node import FwdModuleNode from profiler.prof_common.tree_builder import TreeBuilder @@ -29,8 +31,11 @@ class ProfGraphBuilder: def _create_event_bean_from_ops(cls, op_list: list, name: str) -> TraceEventBean: min_start = min((op.start_time for op in iter(op_list))) max_end = max((op.end_time for op in iter(op_list))) - # Use the backward operators' interval as the backward module's interval; apply the +1/+2 padding so the module contains the operators - return TraceEventBean({"ts": min_start - 1, "dur": float(max_end - min_start) + 2, "name": name}) + # Use the backward operators' interval as the backward module's interval; apply the -0.0001/+0.0001 padding so the module contains the operators + event = TraceEventBean( + {"ts": min_start - Decimal("0.0001"), "dur": float(max_end - min_start + Decimal("0.0002")), "name": name}) + event.event_type = Constant.MODULE_TYPE + return event @classmethod def _trans_flow_to_dict(cls, flow_events: dict, end_events: list) -> dict: @@ -48,6 +53,40 @@ result_data.setdefault(start_point.start_time, []).append(end_event) return result_data + @classmethod + def _create_virtual_node(cls, root_node: ProfNode): + virtual_nodes = [] + first_level_nodes = root_node.child_nodes + root_node.reset_child_nodes([]) + merged_nodes = [] + order_id = 1 + for node in first_level_nodes: + if node.node_type == Constant.OPERATOR_TYPE: + merged_nodes.append(node) + continue + if len(merged_nodes) >= 2: + virtual_node = ProfNode(TraceEventBean({}, f"Operators_Between_Modules_{order_id}"), root_node) + root_node.update_child_nodes(virtual_node) + order_id += 1 + for op_node in merged_nodes: + op_node.parent_node = virtual_node + virtual_node.update_child_nodes(op_node) + virtual_nodes.append(virtual_node) + elif len(merged_nodes) == 1: + root_node.update_child_nodes(merged_nodes[0]) + root_node.update_child_nodes(node) + merged_nodes = [] + if len(merged_nodes) >= 2: + virtual_node = ProfNode(TraceEventBean({}, f"Operators_Between_Modules_{order_id}"), root_node) + root_node.update_child_nodes(virtual_node) + for op_node in merged_nodes: + op_node.parent_node = virtual_node + virtual_node.update_child_nodes(op_node) + virtual_nodes.append(virtual_node) + elif len(merged_nodes) == 1: + root_node.update_child_nodes(merged_nodes[0]) + return virtual_nodes +
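Reviewer note: _create_virtual_node collapses every run of two or more consecutive OPERATOR_TYPE children of the root into a single virtual "Operators_Between_Modules_N" node, while isolated operators and module nodes keep their positions. A schematic of the transformation with hypothetical node names:

    # Before: root -> [op1, op2, ModuleA, op3, ModuleB, op4, op5]
    # After:  root -> [Operators_Between_Modules_1(op1, op2), ModuleA, op3, ModuleB,
    #                  Operators_Between_Modules_2(op4, op5)]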
def build_graph(self): self._prof_data = ProfDataPreProcess(self._prof_data_path).run() all_data = [*self._prof_data.get(Constant.MODULE_EVENT, []), @@ -59,17 +98,19 @@ order_id = name_dict.get(event.name, 0) event.set_id(f"{event.name}_{order_id}") name_dict[event.name] = order_id + 1 - root_node = TreeBuilder.build_tree(all_data, ProfNode, TraceEventBean({}, Constant.NPU_ROOT_ID)) - kernel_flow_dict = self._trans_flow_to_dict(self._prof_data.get(Constant.TORCH_TO_NPU_FLOW, {}), - self._prof_data.get(Constant.KERNEL_EVENT, [])) - for start_time, kernels in kernel_flow_dict.items(): - matched_node = root_node.binary_search(start_time) - while matched_node != Constant.INVALID_RETURN: - matched_node.update_kernel_total_list(kernels) - matched_node = matched_node.binary_search(start_time) - all_data = root_node.find_all_child_nodes() - all_data.append(root_node) - return all_data + all_nodes = TreeBuilder.build_tree(all_data, ProfNode, TraceEventBean({}, Constant.NPU_ROOT_ID)) + if len(all_nodes) < 2: + msg = "Failed to build graph." + raise RuntimeError(msg) + self._update_kernel_details(all_nodes[0]) + self._update_communication_details(all_nodes[0]) + virtual_nodes = self._create_virtual_node(all_nodes[0]) + all_nodes.extend(virtual_nodes) + for node in all_nodes: + node.update_child_precision_index() + if node.node_type != Constant.OPERATOR_TYPE: + node.update_overall_metrics(self._prof_data.get(Constant.OVERLAP_ANALYSIS_EVENT, [])) + return all_nodes def find_bwd_module(self) -> list: bwd_module_list = [] @@ -94,22 +135,49 @@ if op.tid == bwd_tid: bwd_op_list.append(op) pre_status = Constant.BACKWARD + continue elif pre_status == Constant.BACKWARD: bwd_module_list.append(self._create_event_bean_from_ops(bwd_op_list, "nn.Module: BACKWARD")) + bwd_module_list.extend(self._match_fwd_module(module_list, fwdbwd_flow, bwd_op_list)) bwd_op_list.clear() pre_status = Constant.FWD_OR_OPT + if bwd_op_list: + bwd_module_list.append(self._create_event_bean_from_ops(bwd_op_list, "nn.Module: BACKWARD")) + bwd_module_list.extend(self._match_fwd_module(module_list, fwdbwd_flow, bwd_op_list)) + bwd_op_list.clear() + return bwd_module_list + def _match_fwd_module(self, module_list, fwdbwd_flow, bwd_op_list): # Match forward modules through the fwd-bwd flow links to build the overall backward module hierarchy - root_node = TreeBuilder.build_tree(module_list, FwdModuleNode, TraceEventBean({})) - fwdbwd_flow_dict = self._trans_flow_to_dict(fwdbwd_flow, cpu_op_list) + bwd_module_list = [] + all_nodes = TreeBuilder.build_tree(module_list, FwdModuleNode, TraceEventBean({})) + root_node = all_nodes[0] + fwdbwd_flow_dict = self._trans_flow_to_dict(fwdbwd_flow, bwd_op_list) for start_time, end_events in fwdbwd_flow_dict.items(): matched_node = root_node.binary_search(start_time) while matched_node != Constant.INVALID_RETURN: matched_node.update_bwd_op(end_events) matched_node = matched_node.binary_search(start_time) - all_nodes = root_node.find_all_child_nodes() for module_node in all_nodes: if module_node.bwd_op_list: bwd_module_list.append( self._create_event_bean_from_ops(module_node.bwd_op_list, f"{module_node.name} [BACKWARD]")) return bwd_module_list + + def _update_kernel_details(self, root_node): + kernel_flow_dict = self._trans_flow_to_dict(self._prof_data.get(Constant.TORCH_TO_NPU_FLOW, {}), + self._prof_data.get(Constant.KERNEL_EVENT, [])) + for start_time, kernels in kernel_flow_dict.items(): + matched_node = root_node.binary_search(start_time) + while matched_node != Constant.INVALID_RETURN: + matched_node.update_kernel_total_list(kernels) + matched_node = matched_node.binary_search(start_time) + + def _update_communication_details(self, root_node): + communication_flow_dict = self._trans_flow_to_dict(self._prof_data.get(Constant.TORCH_TO_NPU_FLOW, {}), + self._prof_data.get(Constant.HCCL_EVENT, [])) + for start_time, communications in communication_flow_dict.items(): + matched_node = root_node.binary_search(start_time) + while matched_node != Constant.INVALID_RETURN: + matched_node.update_communication_total_list(communications) + matched_node = matched_node.binary_search(start_time) diff --git a/profiler/module_visualization/prof_parse/prof_data_pre_process.py b/profiler/module_visualization/prof_parse/prof_data_pre_process.py index 9dc820e4ca560f816b7738243197b90f1adb8c25..2b5291ea3e46a0e5d78ff6513da66f99f3073141 100644 --- a/profiler/module_visualization/prof_parse/prof_data_pre_process.py +++ b/profiler/module_visualization/prof_parse/prof_data_pre_process.py @@ -12,10 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import logging import os from profiler.prof_common.file_reader import FileReader from profiler.prof_common.constant import Constant +from profiler.prof_common.kernel_bean import KernelBean from profiler.prof_common.trace_event_bean import TraceEventBean @@ -23,13 +25,25 @@ class ProfDataPreProcess: def __init__(self, prof_data_path: str): self._prof_data_path = prof_data_path self._trace_path = "" + self._kernel_details_path = "" self._kernel_pid = None + self._hccl_pid = None + self._overlap_analysis_pid = None self._result_data = {Constant.CPU_OP_EVENT: [], Constant.MODULE_EVENT: [], Constant.KERNEL_EVENT: [], - Constant.TORCH_TO_NPU_FLOW: {}, Constant.FWD_BWD_FLOW: {}} + Constant.TORCH_TO_NPU_FLOW: {}, Constant.FWD_BWD_FLOW: {}, Constant.HCCL_EVENT: [], + Constant.OVERLAP_ANALYSIS_EVENT: []} + + @staticmethod + def _check_trace_data(trace_data): + if not isinstance(trace_data, list): + msg = f"Invalid profiling data path, this feature only supports performance data " \ + f"collected by Ascend PyTorch Profiler." + raise RuntimeError(msg) def run(self) -> dict: self._check_trace_path() self._parse_trace_events() + self._parse_kernel_details() self._check_result_data() return self._result_data @@ -50,53 +64,72 @@ class ProfDataPreProcess: msg = f"Invalid profiling path: {self._prof_data_path}. The data path should be the " \ f"folder that ends with the ascend_pt collected by the Ascend PyTorch Profiler." raise RuntimeError(msg) + kernel_path = os.path.join(profiler_output, "kernel_details.csv") + if os.path.isfile(kernel_path): + self._kernel_details_path = kernel_path self._trace_path = json_path def _parse_trace_events(self): trace_data = FileReader.read_json_file(self._trace_path) self._check_trace_data(trace_data) - iter_trace_data = iter(trace_data) - for event in iter_trace_data: - bean = TraceEventBean(event) - if bean.is_optimizer(): - self._result_data[Constant.MODULE_EVENT].append(bean) - elif bean.is_cpu_op(): - if not bean.is_step(): - self._result_data[Constant.CPU_OP_EVENT].append(bean) - elif bean.is_nn_module(): - self._result_data[Constant.MODULE_EVENT].append(bean) - elif bean.is_torch_to_npu(): - if bean.is_flow_start(): - self._result_data[Constant.TORCH_TO_NPU_FLOW].setdefault(bean.id, {})["start"] = bean - else: - self._result_data[Constant.TORCH_TO_NPU_FLOW].setdefault(bean.id, {})["end"] = bean - elif bean.is_fwd_bwd_flow(): - if bean.is_flow_start(): - self._result_data[Constant.FWD_BWD_FLOW].setdefault(bean.id, {})["start"] = bean - else: - self._result_data[Constant.FWD_BWD_FLOW].setdefault(bean.id, {})["end"] = bean - elif bean.is_kernel_event(self._kernel_pid): - self._result_data[Constant.KERNEL_EVENT].append(bean) - - def _check_trace_data(self, trace_data): - if not isinstance(trace_data, list): - msg = f"Invalid profiling data path, this feature only supports performance data " \ - f"collected by Ascend PyTorch Profiler." 
- raise RuntimeError(msg) - iter_trace_data = iter(trace_data) + iter_trace_data = [TraceEventBean(data) for data in trace_data] for event in iter_trace_data: - bean = TraceEventBean(event) - if bean.is_npu_process(): - self._kernel_pid = bean.pid + if self._kernel_pid is not None and self._hccl_pid is not None and self._overlap_analysis_pid is not None: break + if not event.is_meta(): + continue + if event.is_npu_process(): + self._kernel_pid = event.pid + elif event.is_hccl_process(): + self._hccl_pid = event.pid + elif event.is_overlap_analysis_process(): + self._overlap_analysis_pid = event.pid if self._kernel_pid is None: - msg = f"There is no operator on the NPU side for this data, please check whether the NPU switch is enabled." + msg = "There is no operator on the NPU side for this data, please check whether the NPU switch is enabled." raise RuntimeError(msg) + for event in iter_trace_data: + if event.is_optimizer(): + event.event_type = Constant.MODULE_TYPE + self._result_data[Constant.MODULE_EVENT].append(event) + elif event.is_cpu_op(): + if not event.is_step(): + event.event_type = Constant.OPERATOR_TYPE + self._result_data[Constant.CPU_OP_EVENT].append(event) + elif event.is_nn_module(): + event.event_type = Constant.MODULE_TYPE + self._result_data[Constant.MODULE_EVENT].append(event) + elif event.is_torch_to_npu(): + if event.is_flow_start(): + self._result_data[Constant.TORCH_TO_NPU_FLOW].setdefault(event.id, {})["start"] = event + else: + self._result_data[Constant.TORCH_TO_NPU_FLOW].setdefault(event.id, {})["end"] = event + elif event.is_fwd_bwd_flow(): + if event.is_flow_start(): + self._result_data[Constant.FWD_BWD_FLOW].setdefault(event.id, {})["start"] = event + else: + self._result_data[Constant.FWD_BWD_FLOW].setdefault(event.id, {})["end"] = event + elif event.is_kernel_event(self._kernel_pid): + self._result_data[Constant.KERNEL_EVENT].append(event) + elif event.is_hccl_event(self._hccl_pid): + self._result_data[Constant.HCCL_EVENT].append(event) + elif event.is_overlap_analysis_event(self._overlap_analysis_pid): + self._result_data[Constant.OVERLAP_ANALYSIS_EVENT].append(event) + + def _parse_kernel_details(self): + if not self._kernel_details_path: + return + try: + all_kernels = FileReader.read_csv_file(self._kernel_details_path, KernelBean) + except Exception as e: + logging.error(e) + return + kernels = list(filter(lambda x: x.is_computing_op, all_kernels)) + if kernels: + self._result_data[Constant.KERNEL_EVENT] = kernels def _check_result_data(self): if not self._result_data.get(Constant.CPU_OP_EVENT): - msg = f"This data does not have any aten operator, please make sure to enable the CPU switch." + msg = "This data does not have any aten operator, please make sure to enable the CPU switch." raise RuntimeError(msg) - if not self._result_data.get(Constant.MODULE_EVENT): - msg = f"This data does not collect any modules, please make sure to turn on the with_stack switch." + if not [event for event in self._result_data.get(Constant.MODULE_EVENT) if event.is_nn_module()]: + msg = "This data does not collect any modules, please make sure to enable the with_stack or with_modules switch." raise RuntimeError(msg)
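Reviewer note: the rewritten _parse_trace_events is now a two-pass scan: the first loop inspects only "M" (metadata) events to resolve the Ascend Hardware, HCCL and Overlap Analysis pids, and the second loop routes every event by those pids. A minimal sketch of the idea with a hypothetical three-event trace:

    trace = [
        {"ph": "M", "name": "process_name", "pid": 4, "args": {"name": "Ascend Hardware"}},
        {"ph": "M", "name": "process_name", "pid": 6, "args": {"name": "HCCL"}},
        {"ph": "X", "name": "hcom_allReduce_1", "pid": 6, "ts": "100", "dur": 5},
    ]
    # Pass 1 sets _kernel_pid = 4 and _hccl_pid = 6; pass 2 then routes the "X" event
    # into Constant.HCCL_EVENT because is_hccl_event(6) matches its pid and "hcom_" prefix.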
diff --git a/profiler/prof_common/base_node.py b/profiler/prof_common/base_node.py index b7cd6780003f9e0e5c58495ac43a893214e68beb..1e12294328151212f3137f775a161084f8e8073d 100644 --- a/profiler/prof_common/base_node.py +++ b/profiler/prof_common/base_node.py @@ -47,6 +47,10 @@ class BaseNode: def end_time(self) -> Decimal: return self._event.end_time + @parent_node.setter + def parent_node(self, parent_node): + self._parent_node = parent_node + def update_child_nodes(self, node): self._child_nodes.append(node) diff --git a/profiler/prof_common/constant.py b/profiler/prof_common/constant.py index 87bc51b56bc71c2a70e35a6b08aa4de7bd521f1d..b0c8877f5698dfb263a0ec8f5a7bb031bd15a556 100644 --- a/profiler/prof_common/constant.py +++ b/profiler/prof_common/constant.py @@ -23,9 +23,26 @@ class Constant(object): CPU_OP_EVENT = "op_event" TORCH_TO_NPU_FLOW = "torch_to_device" KERNEL_EVENT = "kernel_event" + HCCL_EVENT = "hccl_event" + OVERLAP_ANALYSIS_EVENT = "overlap_event" FWD_BWD_FLOW = "fwd_to_bwd" NPU_ROOT_ID = "NPU" FWD_OR_OPT = 0 BACKWARD = 1 INVALID_RETURN = -1 + + # node type + MODULE_TYPE = 0 + OPERATOR_TYPE = 1 + VIRTUAL_TYPE = 9 + + # trace bar + NPU_BAR = "Ascend Hardware" + HCCL_BAR = "HCCL" + OVERLAP_BAR = "Overlap Analysis" + + # overlap_analysis event + COMPUTING_EVENT = "Computing" + FREE_EVENT = "Free" + UNCOVERED_COMMUNICATION_EVENT = "Communication(Not Overlapped)" diff --git a/profiler/prof_common/file_reader.py b/profiler/prof_common/file_reader.py index d8a9c8fb4d6599edf46973f8e93aa708903ff007..9a225131f9485f368dd0c85c33dcbc612a53918d 100644 --- a/profiler/prof_common/file_reader.py +++ b/profiler/prof_common/file_reader.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import csv import json import logging import os @@ -57,3 +58,29 @@ class FileReader: file.write(json.dumps(data, indent=indent)) except Exception as e: raise RuntimeError(f"Can't create the file: {output_path}") from e + + @classmethod + def read_csv_file(cls, file_path: str, bean_class: any = None) -> any: + PathManager.check_path_readable(file_path) + if not os.path.isfile(file_path): + raise FileNotFoundError("File does not exist.") + file_size = os.path.getsize(file_path) + if file_size <= 0: + return [] + if file_size > Constant.MAX_FILE_SIZE_5_GB: + check_msg = input( + f"The file({file_path}) size exceeds the preset max value. Continue reading the file? [y/n]") + if check_msg.lower() != "y": + logging.warning("The user chose not to read the file: %s", file_path) + return [] + result_data = [] + try: + with open(file_path, newline="") as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + row_data = bean_class(row) if bean_class else row + result_data.append(row_data) + except Exception as e: + msg = f"Failed to read the file: {file_path}" + raise RuntimeError(msg) from e + return result_data diff --git a/profiler/prof_common/kernel_bean.py b/profiler/prof_common/kernel_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..4d60a69080fc909a79e03374e1730608cbfa9445 --- /dev/null +++ b/profiler/prof_common/kernel_bean.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from profiler.prof_common.utils import convert_to_decimal + + +class KernelBean: + def __init__(self, data: dict): + self._name = data.get("Name", "") + self._op_type = data.get("Type", "") + self._core_type = data.get("Accelerator Core", "") + self._input_shape = data.get("Input Shapes", "").replace("\"", "") + self._input_type = data.get("Input Data Types", "") + self._input_format = data.get("Input Formats", "") + self._duration = data.get("Duration(us)", 0) + self._ts = data.get("Start Time(us)", "") + + @property + def start_time(self): + return convert_to_decimal(self._ts) + + @property + def end_time(self): + return self.start_time + convert_to_decimal(self.dur) + + @property + def is_computing_op(self): + return self._core_type != "HCCL" + + @property + def dur(self): + return float(self._duration) + + @property + def kernel_info(self): + return [self._name, self._op_type, self._core_type, self._input_shape, self._input_type, self.dur] diff --git a/profiler/prof_common/trace_event_bean.py b/profiler/prof_common/trace_event_bean.py index 2d4b96e4f6aa84ce225531da89085ba4a07335a5..f1ba62e69b93bc1cbcca1abbc2e0c2d86224545d 100644 --- a/profiler/prof_common/trace_event_bean.py +++ b/profiler/prof_common/trace_event_bean.py @@ -14,14 +14,16 @@ # limitations under the License. from decimal import Decimal +from profiler.prof_common.constant import Constant from profiler.prof_common.utils import convert_to_decimal from profiler.prof_common.analyze_dict import AnalyzeDict class TraceEventBean(AnalyzeDict): - def __init__(self, data: dict, unique_id: int = None): + def __init__(self, data: dict, unique_id: str = None): super().__init__(data) self._id = unique_id + self._type = None @property def unique_id(self): @@ -35,6 +37,18 @@ class TraceEventBean(AnalyzeDict): def end_time(self) -> Decimal: return self.start_time + convert_to_decimal(self.dur) + @property + def kernel_info(self): + return [self.name, self.args.get("Task Type", ""), self.dur] + + @property + def event_type(self): + return self._type + + @event_type.setter + def event_type(self, event_type): + self._type = event_type + def set_id(self, name_id): self._id = name_id @@ -62,8 +76,23 @@ class TraceEventBean(AnalyzeDict): def is_flow_end(self): return self.ph == "f" + def is_meta(self): + return self.ph == "M" + def is_kernel_event(self, kernel_pid): return self.ph == "X" and self.pid == kernel_pid + def is_hccl_event(self, hccl_pid): + return self.ph == "X" and self.pid == hccl_pid and self.name.startswith("hcom_") + + def is_overlap_analysis_event(self, overlap_analysis_pid): + return self.ph == "X" and self.pid == overlap_analysis_pid + def is_npu_process(self): - return self.ph == "M" and self.name == "process_name" and self.args.get("name", "") == "Ascend Hardware" + return self.ph == "M" and self.name == "process_name" and self.args.get("name", "") == Constant.NPU_BAR + + def is_hccl_process(self): + return self.ph == "M" and self.name == "process_name" and self.args.get("name", "") == Constant.HCCL_BAR + + def is_overlap_analysis_process(self): + return self.ph == "M" and self.name == "process_name" and 
self.args.get("name", "") == Constant.OVERLAP_BAR diff --git a/profiler/prof_common/tree_builder.py b/profiler/prof_common/tree_builder.py index b7d3e1baf6aa48c480124056ced422178f8fe7a2..b6311c1a9374c4b13e06afd6273d26269fbc1d45 100644 --- a/profiler/prof_common/tree_builder.py +++ b/profiler/prof_common/tree_builder.py @@ -19,8 +19,10 @@ class TreeBuilder: @staticmethod def build_tree(event_list: list, node_class: any, root_bean: any): root_node = node_class(root_bean) + all_nodes = [root_node] + [None] * len(event_list) event_list.sort(key=lambda x: x.start_time) last_node = root_node + index = 1 for event in event_list: while last_node: if last_node != root_node and event.start_time > last_node.end_time: @@ -28,6 +30,8 @@ class TreeBuilder: continue tree_node = node_class(event, last_node) last_node.update_child_nodes(tree_node) + all_nodes[index] = tree_node last_node = tree_node + index += 1 break - return root_node + return all_nodes diff --git a/profiler/test/run_ut.py b/profiler/test/run_ut.py index ee27abaace177c7eab9021bcd3fcf51f0368c832..6ab208dc29e9d5feb2418f9243d395a1aabfa23b 100644 --- a/profiler/test/run_ut.py +++ b/profiler/test/run_ut.py @@ -13,6 +13,7 @@ def set_python_path(): os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "advisor") advisor_backend_root = os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "advisor", "advisor_backend") + profiler_parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Update PYTHONPATH python_path = os.environ.get("PYTHONPATH", "") if not python_path: @@ -22,6 +23,7 @@ def set_python_path(): python_path += f":{compare_tools_root}" python_path += f":{advisor_root}" python_path += f":{advisor_backend_root}" + python_path += f":{profiler_parent_dir}" os.environ["PYTHONPATH"] = python_path diff --git a/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_dataloader_checker.py b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_dataloader_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..3d8e22b7c66df5c865887e39d4e32e117ea129ee --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_dataloader_checker.py @@ -0,0 +1,65 @@ +import unittest +import os +import sys +import yaml + +from profiler.advisor.analyzer.dataloader.dataloader_checker import DataloaderChecker +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestDataloaderChecker(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + recover_env() + + def setUp(self) -> None: + rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))))), + "advisor", "rules", "dataloader.yaml") + + with open(rule_path, "rb") as file: + self.rule = yaml.safe_load(file) + + def test_no_dataloader(self): + dataloader_duration = (self.rule.get("dataloader_duration_threshold") - 1) * 1000 + dataset = self._get_mock_dataset(dataloader_duration, is_empty_dataset=True) + + checker = DataloaderChecker() + checker.check_slow_dataloader(dataset) + self.assertFalse(checker.dataloader_issues) + + def test_no_slow_dataloader(self): + dataloader_duration = (self.rule.get("dataloader_duration_threshold") - 1) * 1000 + dataset = self._get_mock_dataset(dataloader_duration, is_empty_dataset=False) + checker = DataloaderChecker() + 
checker.check_slow_dataloader(dataset) + self.assertFalse(checker.dataloader_issues) + + def test_found_slow_dataloader(self): + dataloader_duration = (self.rule.get("dataloader_duration_threshold") + 1) * 1000 + dataset = self._get_mock_dataset(dataloader_duration, is_empty_dataset=False) + checker = DataloaderChecker() + checker.check_slow_dataloader(dataset) + self.assertTrue(checker.dataloader_issues) + + desc = self.rule.get("problem").format(dataloader_duration=dataloader_duration / 1000, + dataloader_duration_threshold=self.rule.get( + "dataloader_duration_threshold")) + + self.assertEqual(desc, checker.desc) + + def _get_mock_dataset(self, dur, is_empty_dataset=False): + dataset = TimelineEvent() + if is_empty_dataset: + return dataset + + dataset["dataloader"] = [TimelineEvent({"dur": dur, "name": "dataloader"})] + return dataset + + +if __name__ == '__main__': + unittest.main() diff --git a/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_syncbn_checker.py b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_syncbn_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..d1df810a0ec9fcc28d28d73836ec6bb2ec86b6db --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_syncbn_checker.py @@ -0,0 +1,59 @@ +import unittest +import os +import sys +import yaml + +from profiler.advisor.analyzer.schedule.syncbn.syncbn_checker import SyncBNChecker +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestSyncBNChecker(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + recover_env() + + def setUp(self) -> None: + rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))))), + "advisor", "rules", "sync_batchnorm.yaml") + + with open(rule_path, "rb") as file: + self.rule = yaml.safe_load(file) + + def test_no_syncbn(self): + dataset = self._get_mock_dataset(1, is_empty_dataset=True) + + checker = SyncBNChecker() + checker.check_syncbn(dataset) + self.assertFalse(checker.syncbn_issues) + + def test_syncbn_not_reach_threshold(self): + dataset = self._get_mock_dataset(self.rule.get("max_syncbn_num") - 1, is_empty_dataset=False) + checker = SyncBNChecker() + checker.check_syncbn(dataset) + self.assertFalse(checker.syncbn_issues) + + def test_found_syncbn(self): + dataset = self._get_mock_dataset(self.rule.get("max_syncbn_num") + 1, is_empty_dataset=False) + checker = SyncBNChecker() + checker.check_syncbn(dataset) + self.assertTrue(checker.syncbn_issues) + + desc = self.rule.get("problem").format(syncbn_num=self.rule.get("max_syncbn_num") + 1) + + self.assertEqual(desc, checker.desc) + + def _get_mock_dataset(self, syncbn_num, is_empty_dataset=False): + dataset = TimelineEvent() + if is_empty_dataset: + return dataset + + dataset["sync_batchnorm"] = [] + for _ in range(syncbn_num): + dataset["sync_batchnorm"].append(TimelineEvent({"name": "SyncBatchNorm"})) + return dataset + + +if __name__ == '__main__': + unittest.main() diff --git a/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_synchronize_stream.py 
b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_synchronize_stream.py new file mode 100644 index 0000000000000000000000000000000000000000..360363ce371afb43cb61abd1a5b5fc2b2720aecc --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_synchronize_stream.py @@ -0,0 +1,53 @@ +import unittest +import os +import sys +import yaml + +from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_checker import SynchronizeStreamChecker +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestSynchronizeChecker(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + recover_env() + + def setUp(self) -> None: + rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))))), + "advisor", "rules", "synchronize.yaml") + + with open(rule_path, "rb") as file: + self.rule = yaml.safe_load(file) + + def test_no_synchronize_stream(self): + dataset = self._get_mock_dataset(1, [], is_empty_dataset=True) + + checker = SynchronizeStreamChecker() + checker.check_synchronize(dataset) + self.assertFalse(checker.synchronize_issues) + + def test_max_synchronize_stream(self): + dataset = self._get_mock_dataset(100, [], is_empty_dataset=False) + checker = SynchronizeStreamChecker() + checker.check_synchronize(dataset) + self.assertFalse(checker.synchronize_issues) + + def _get_mock_dataset(self, total_count, slow_synchronize_stream, is_empty_dataset=False): + dataset = TimelineEvent() + if is_empty_dataset: + return dataset + + dataset["synchronize_stream"] = TimelineEvent( + dict( + total_count=total_count, + slow_synchronize_stream=slow_synchronize_stream, + rule=dict(max_synchronize_num=10, problem="", solutions=[]), + ) + ) + return dataset + + +if __name__ == '__main__': + unittest.main() diff --git a/profiler/test/ut/advisor/compute_advice/test_frequency_advice.py b/profiler/test/ut/advisor/compute_advice/test_frequency_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..51acf3b8e247d39ddd07e4505e84f3f0110ad782 --- /dev/null +++ b/profiler/test/ut/advisor/compute_advice/test_frequency_advice.py @@ -0,0 +1,145 @@ +import os +import shutil +import stat +import json + +import unittest +from profiler.advisor.interface.interface import Interface +from profiler.advisor.common.analyzer_scopes import SupportedScopes + + +class TestFrequencyAdvice(unittest.TestCase): + TMP_DIR = "./ascend_pt" + OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT" + DEVICE_DIR = "./ascend_pt/PROF_000001_20240415174447255_OAANHDOMMJMHGIFC/device_0" + interface = None + err_interface = None + + def tearDown(self): + if os.path.exists(TestFrequencyAdvice.TMP_DIR): + shutil.rmtree(TestFrequencyAdvice.TMP_DIR) + self.clear_htmls() + + def setUp(self): + if os.path.exists(TestFrequencyAdvice.TMP_DIR): + shutil.rmtree(TestFrequencyAdvice.TMP_DIR) + if not os.path.exists(TestFrequencyAdvice.TMP_DIR): + os.makedirs(TestFrequencyAdvice.TMP_DIR) + if not os.path.exists(TestFrequencyAdvice.OUTPUT_DIR): + os.makedirs(TestFrequencyAdvice.OUTPUT_DIR) + if not os.path.exists(TestFrequencyAdvice.DEVICE_DIR): + os.makedirs(TestFrequencyAdvice.DEVICE_DIR) + self.clear_htmls() + + @classmethod + def clear_htmls(cls): + current_path = os.path.dirname(os.path.abspath(__file__)) + 
for filename in os.listdir(current_path): + # check whether the file name starts with "att" + if filename.startswith("att"): + # build the full path of the file + file_path = os.path.join(current_path, filename) + # delete the file + os.remove(file_path) + + @classmethod + def get_basic_trace_view(cls): + # Python pid + py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}} + # ascend pid + ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}} + # cann pid + cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}} + # ascend hardware ops + ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4, + "args": {"Task Type": "AI_CORE"}} + ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106888", "dur": 80, "tid": 3, "pid": 4, + "args": {"Task Type": "AI_CORE"}} + # flow event + flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}} + flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}} + return [py_pid_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, flow_event_s, flow_event_e] + + @classmethod + def create_info_json(cls): + info = { + "DeviceInfo": [ + { + "id": 7, + "env_type": 3, + "ctrl_cpu_id": "ARMv8_Cortex_A55", + "ctrl_cpu_core_num": 1, + "ctrl_cpu_endian_little": 1, + "ts_cpu_core_num": 0, + "ai_cpu_core_num": 6, + "ai_core_num": 25, + "ai_cpu_core_id": 2, + "ai_core_id": 0, + "aicpu_occupy_bitmap": 252, + "ctrl_cpu": "0", + "ai_cpu": "2,3,4,5,6", + "aiv_num": 50, + "hwts_frequency": "49.999001", + "aic_frequency": "1850", + "aiv_frequency": "1850" + } + ] + } + with os.fdopen(os.open(f"{TestFrequencyAdvice.DEVICE_DIR}/info.json.0", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(info)) + + @classmethod + def create_non_910B_trace_view(cls): + basic_info = cls.get_basic_trace_view() + + # python ops + py_event1 = {"ph": "X", "cat": "python_function", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2, + "pid": 1, + "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}} + py_event2 = {"ph": "X", "cat": "python_function", "name": "slice", "ts": "199", "dur": 200, "tid": 2, "pid": 1, + "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}} + raw_data = [ + *basic_info, py_event1, py_event2 + ] + with os.fdopen(os.open(f"{TestFrequencyAdvice.OUTPUT_DIR}/trace_view.json", + # with os.fdopen(os.open(f"{TestFrequencyAdvice.OUTPUT_DIR}/msprof_20240415174455.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + @classmethod + def create_910B_trace_view(cls): + basic_info = cls.get_basic_trace_view() + + # AI Core Freq counter events + py_event1 = {"name": "AI Core Freq", "ts": "1699529623106000.061", "pid": 682820896, "tid": 0, + "args": {"MHz": 1850}, "ph": "C"} + py_event2 = {"name": "AI Core Freq", "ts": "1699529623106770.541", "pid": 682820896, "tid": 0, + "args": {"MHz": 800}, "ph": "C"} + raw_data = [ + *basic_info, py_event1, py_event2 + ] + + with os.fdopen(os.open(f"{TestFrequencyAdvice.OUTPUT_DIR}/trace_view.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + def test_run_should_run_success_when_msprof_not_contain_frequency_data(self): + self.create_info_json() + self.create_non_910B_trace_view() + interface = Interface(profiling_path=self.TMP_DIR) 
+ dimension = "computation" + scope = SupportedScopes.FREQ_ANALYSIS + result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR) + self.assertEqual(0, len(result.data.get("AI Core Frequency", []))) + result.clear() + + def test_run_should_run_success_when_trace_view_contain_frequency_data(self): + self.create_info_json() + self.create_910B_trace_view() + interface = Interface(profiling_path=self.TMP_DIR) + dimension = "computation" + scope = SupportedScopes.FREQ_ANALYSIS + result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR) + self.assertEqual(2, len(result.data.get("AI Core Frequency", {}).get("data", []))) + result.clear() diff --git a/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py b/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py index 44d97b248e68331a49388dac4a6a21fecb9f5285..80734635929597fff2f5a1bbbe79582817ba2858 100644 --- a/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py +++ b/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py @@ -24,6 +24,11 @@ class ProfilingParser(BaseProfilingParser): self._enable_operator_compare = True self._enable_memory_compare = True self._enable_communication_compare = True + self._enable_kernel_compare = True + self._enable_api_compare = True + + def _update_kernel_details(self): + pass def _update_memory_list(self): pass
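Reviewer note: a minimal usage sketch of the two new switches through the Args class; the two profiling-path parameter names are assumptions inferred from ArgsManager above and are not shown in this patch. When no enable_* switch is passed at all, ArgsManager now turns on all six comparisons, including the two added here:

    from compare_backend.utils.compare_args import Args

    # base_profiling_path / comparison_profiling_path are assumed parameter names
    args = Args(base_profiling_path="./base_ascend_pt",
                comparison_profiling_path="./comparison_ascend_pt",
                enable_api_compare=True,     # fills the "ApiCompare" sheet
                enable_kernel_compare=True)  # fills the "KernelCompare" sheet (NPU data only)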