From 6477f296135bd8d1817b5a1ceaeaa86c551c965e Mon Sep 17 00:00:00 2001
From: qianggee <qjchenb@163.com>
Date: Fri, 26 Jul 2024 07:25:59 +0000
Subject: [PATCH 1/3] support vpp

---
 .../accuracy_tools/kj600/kj600/module_hook.py | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py
index d13736aeb20..6d1b25023e5 100644
--- a/debug/accuracy_tools/kj600/kj600/module_hook.py
+++ b/debug/accuracy_tools/kj600/kj600/module_hook.py
@@ -147,6 +147,7 @@ class TrainerMon:
         anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None
         
         self.optimizer_hooked = False
+        self.vpp = False
         output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output')
         cur_time = datetime.now().strftime('%b%d_%H-%M-%S')
         unique_id = str(uuid.uuid4())[:8]
@@ -490,16 +491,33 @@ class TrainerMon:
             with torch.no_grad():
                 context.grad_acc += grad
 
+        def register_hooks(model_chunk, prefix=''):
+            for param_name, param in model_chunk.named_parameters():
+                name = prefix + param_name
+                for target in self.config['targets'].keys():
+                    context = self.grad_context[name]
+                    if param_name.startswith(target) and param.requires_grad:
+                        self._smallest_rank_print(f'>> monitoring: {name}')
+                        self.param2name[param] = name
+                        param.register_hook(partial(param_hook, context=context))
+                        context.grad_acc = torch.zeros_like(param).to(DEVICE)
+
+
         if self.print_struct:
             self.module_struct = {
                 module_name: 1. for module_name, module in model.named_modules()}
             return
 
-        for name, param in model.named_parameters():
-            for target in self.config['targets'].keys():
-                context = self.grad_context[name]
-                if name.startswith(target) and param.requires_grad:
-                    self._smallest_rank_print(f'>> monitoring: {name}')
-                    self.param2name[param] = name
-                    param.register_hook(partial(param_hook, context=context))
-                    context.grad_acc = torch.zeros_like(param).to(DEVICE)
+        if isinstance(model, list):
+            if len(model) > 1:
+                self.vpp = True
+                self._smallest_rank_print('vpp enabled')
+            
+            for vpp_stage, model_chunk in enumerate(model):
+                prefix = f'{vpp_stage}_' if self.vpp else ''
+                register_hooks(model_chunk, prefix=prefix)
+        
+        else:
+            register_hooks(model)
+            
+        
-- 
Gitee


From 7a02fd3732936a65d69aa5fbc71fa2938eafde9c Mon Sep 17 00:00:00 2001
From: qianggee <qjchenb@163.com>
Date: Thu, 1 Aug 2024 03:11:01 +0000
Subject: [PATCH 2/3] monitor gnorm before reduce by model hook

---
 .../accuracy_tools/kj600/kj600/module_hook.py | 148 +++++++++++-------
 1 file changed, 90 insertions(+), 58 deletions(-)

diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py
index 6d1b25023e5..8b8826a84d0 100644
--- a/debug/accuracy_tools/kj600/kj600/module_hook.py
+++ b/debug/accuracy_tools/kj600/kj600/module_hook.py
@@ -83,12 +83,14 @@ class GradContext:
     def __init__(self) -> None:
         self.pre = []
         self.post = []
-        self.grad_acc = None
+        self.grad_acc = {}
+        self.micro_step = -1
 
     def reset(self):
         self.pre.clear()
         self.post.clear()
-        self.grad_acc.fill_(0.)
+        for k,v in self.grad_acc.items():
+            v.fill_(0.)
 
 
 class TrainerMon:
@@ -101,7 +103,7 @@ class TrainerMon:
         self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext)
         self.optimizer_context = defaultdict(OptimizerContext)
         self.cc_context = defaultdict(CommunicationContext)
-        self.grad_context = defaultdict(GradContext)
+        self.grad_context = GradContext()
         self.params_have_main_grad = params_have_main_grad
         self.config = get_config(config_file_path)
         self.module_rank_list = self.config.get("module_ranks", [])
@@ -160,7 +162,12 @@ class TrainerMon:
         # A HeatmapVisualizer instance is associated with an image
         self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer)
         self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer)
-        self.micro_batch_number = 0
+        self.micro_batch_number = 1
+        self.step = -1
+        self.rank = dist.get_rank()
+
+        self.weight_hooked = False
+        self.optimizer_hooked = False
 
         self.param_name_list = []
         self.param2name = defaultdict(str)
@@ -220,11 +227,47 @@ class TrainerMon:
             self.hook_optimizer()
         return
 
+    def _get_wg_metric(self, tag='pre_grad'):
+        grad_dict = {}
+        for param, name in self.param2name.items():
+            grad = param.main_grad if self.params_have_main_grad else param.grad
+            if grad is None:
+                print_warn_log(f"grad is None: {name}, maybe something wrong happened.")
+                continue
+            key = get_summary_writer_tag_name(name, tag, self.rank)
+            grad_dict[key]  = grad
+        metric_dict = {}
+        for metric_name in self.ops:
+            metric_dict[metric_name] = get_metrics(metric_name, grad_dict, self.eps)
+        return metric_dict
+
+
     def monitor_gnorm_with_ad(self, model, grad_acc_steps):
-        self._hook_weights(model)
         self.hook_optimizer()
+
+        if self.print_struct:
+            self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)}
+            return
+        self._register_param_name(model)
+        
         self.micro_batch_number = grad_acc_steps
 
+        def model_backward_hook(module, input_grad, output_grad):
+            
+            if self.wg_distribution:
+                self.grad_context.micro_step += 1 ## error if vpp
+                if self.grad_context.micro_step == (self.micro_batch_number - 1):
+                    print('>> pre grad from backward')
+                    self.grad_context.micro_step = -1
+                    wg_metric_dict = self._get_wg_metric(tag='pre_grad')
+                    self.grad_context.pre.append(wg_metric_dict)
+
+        # for model_chunk in model:
+        #     model_chunk.register_full_backward_hook(model_backward_hook)
+
+        self._hook_weights()
+
+
     def build_tbtag_tensor_map(self, module_name, tag, tensor):
         metrics = {}
         rank = dist.get_rank() if dist.is_initialized() else None
@@ -277,13 +320,12 @@ class TrainerMon:
         if not self.wg_distribution:
             return
 
-        for name in self.param2name.values():
-            context = self.grad_context[name]
-            for metric_name in self.ops:
-                write_metrics_tensorboard(metric_name, self.summary_writer, context.pre, step)
-                write_metrics_tensorboard(metric_name, self.summary_writer, context.post, step)
+        for metric_name in self.ops:
+            # write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.pre, step)
+            write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.post, step)
+            write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.xx, step)
 
-            context.reset()
+        self.grad_context.reset()
 
     def hook_optimizer(self):
         # in DDP by default use params_have_main_grad
@@ -303,7 +345,18 @@ class TrainerMon:
             context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self,
                 optimizer, self.param2name)
             
-            rank = dist.get_rank() if dist.is_initialized() else None
+            if self.wg_distribution:
+                if self.weight_hooked:
+                    print('>>> pre grad from weight hook')
+                    metric_dict = {}
+                    for metric_name in self.ops:
+                        metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.grad_acc, self.eps)
+                    # self.grad_context.grad.append(metric_dict)
+                    self.grad_context.xx = [metric_dict]
+  
+                wg_metric_dict = self._get_wg_metric(tag='post_grad')
+                self.grad_context.post.append(wg_metric_dict)
+
             for param, name in self.param2name.items():
                 if "params_effrank" in self.config and name in self.config["params_effrank"]:
                     context.param_effective_rank[name] = eff_rank(param.detach())
@@ -311,18 +364,6 @@ class TrainerMon:
                 if grad is None:
                     print_warn_log(f"grad is None: {name}, maybe something wrong happened.")
                     continue
-                if self.wg_distribution:
-                    metric_dict = {}
-                    key = get_summary_writer_tag_name(name, 'post_grad', rank)
-                    for metric_name in self.ops:
-                        metric_dict[metric_name] = get_metrics(metric_name, {key: grad}, self.eps)
-                    self.grad_context[name].post.append(metric_dict)
-
-                    metric_dict = {}
-                    key = get_summary_writer_tag_name(name, 'pre_grad', rank)
-                    for metric_name in self.ops:
-                        metric_dict[metric_name] = get_metrics(metric_name, {key: self.grad_context[name].grad_acc}, self.eps)
-                    self.grad_context[name].pre.append(metric_dict)
 
                 if self.mg_direction:
                     if context.step == 0:
@@ -390,6 +431,22 @@ class TrainerMon:
         else:
             print_info_log(msg)
 
+    def _register_param_name(self, model):
+        # Record a name for each trainable parameter under a configured target; a bare module counts as one chunk.
+        chunks = model if isinstance(model, list) else [model]
+        if len(chunks) > 1:
+            self.vpp = True
+            self._smallest_rank_print('vpp enabled')
+        targets = tuple(self.config['targets'])  # startswith(tuple): each parameter is matched and logged once
+        for vpp_stage, model_chunk in enumerate(chunks):
+            prefix = f'{vpp_stage}_' if self.vpp else ''
+            for param_name, param in model_chunk.named_parameters():
+                if param.requires_grad and param_name.startswith(targets):
+                    name = prefix + param_name
+                    self._smallest_rank_print(f'>> monitoring: {name}')
+                    self.param2name[param] = name
+
+
     def _hook_module(self, target_names, module: torch.nn.Module, fwd_or_bkd):
         if '_modules' not in module.__dict__:
             # nothing to hook
@@ -483,41 +540,16 @@ class TrainerMon:
                 hooked_count += 1
         return hooked_count
 
-    def _hook_weights(self, model):
-        self.wg_distribution = True
-        rank = dist.get_rank() if dist.is_initialized() else None
+    def _hook_weights(self):
+        context = self.grad_context
 
-        def param_hook(grad, context):
+        def param_hook(grad, grad_acc):
             with torch.no_grad():
-                context.grad_acc += grad
-
-        def register_hooks(model_chunk, prefix=''):
-            for param_name, param in model_chunk.named_parameters():
-                name = prefix + param_name
-                for target in self.config['targets'].keys():
-                    context = self.grad_context[name]
-                    if param_name.startswith(target) and param.requires_grad:
-                        self._smallest_rank_print(f'>> monitoring: {name}')
-                        self.param2name[param] = name
-                        param.register_hook(partial(param_hook, context=context))
-                        context.grad_acc = torch.zeros_like(param).to(DEVICE)
+                grad_acc += grad
 
+        for param, name in self.param2name.items():
+            key = get_summary_writer_tag_name(name, 'acc_grad', self.rank)
+            context.grad_acc[key] = torch.zeros_like(param).to(DEVICE)
+            param.register_hook(partial(param_hook, grad_acc=context.grad_acc[key]))
 
-        if self.print_struct:
-            self.module_struct = {
-                module_name: 1. for module_name, module in model.named_modules()}
-            return
-
-        if isinstance(model, list):
-            if len(model) > 1:
-                self.vpp = True
-                self._smallest_rank_print('vpp enabled')
-            
-            for vpp_stage, model_chunk in enumerate(model):
-                prefix = f'{vpp_stage}_' if self.vpp else ''
-                register_hooks(model_chunk, prefix=prefix)
-        
-        else:
-            register_hooks(model)
-            
-        
+        self.weight_hooked = True
\ No newline at end of file
-- 
Gitee


From 133c95ce5a0556828c432b3dfc84834391005255 Mon Sep 17 00:00:00 2001
From: qianggee <qjchenb@163.com>
Date: Mon, 5 Aug 2024 07:58:46 +0000
Subject: [PATCH 3/3] support csv output

---
 .../kj600/kj600/anomaly_detect.py             | 53 ++++++++++-
 .../accuracy_tools/kj600/kj600/module_hook.py | 95 ++++++++++---------
 .../kj600/kj600/module_metric.py              | 27 ++++--
 debug/accuracy_tools/kj600/kj600/utils.py     | 27 +++++-
 4 files changed, 150 insertions(+), 52 deletions(-)

diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py
index cbd7b6daa2f..5a98aabb863 100644
--- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py
+++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py
@@ -1,10 +1,13 @@
+import os
 import statistics  as st
 from abc import ABC
 from typing import List
+import pandas as pd
 import sys
 from torch.utils.tensorboard import SummaryWriter
 from collections import defaultdict
-from kj600.utils import print_info_log
+from kj600.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory
+
 
 class ScanRule(ABC):
     def apply(self, history, cur):
@@ -59,6 +62,54 @@ class bcolors:
     BOLD = '\033[1m'
     UNDERLINE = '\033[4m'
 
+
+class CSVWriterWithAD:
+    """Collect scalars per tag, scan them with the anomaly rules, and dump them to CSV files."""
+    def __init__(self, path, ad_rules, job_id, anomaly_inform=False):
+        self.path = path
+        create_directory(path)
+        self.tag2scalars = defaultdict(list)  # tag -> [(value, running average), ...]
+        self.ad_rules = ad_rules
+        self.job_id = job_id
+        self.anomaly_inform = anomaly_inform
+        self.context_dict = defaultdict(list)  # row name -> values buffered since the last write_csv
+        self.header = []
+
+    def write_csv(self, prefix, step):
+        """Append the buffered rows to `<path>/<prefix>_<step>.csv`, then empty the buffer."""
+        if not self.context_dict:
+            return
+        filepath = os.path.join(self.path, f'{prefix}_{step}.csv')
+        if not os.path.exists(filepath):
+            make_file_safety(filepath)
+            pd.DataFrame(columns=self.header).to_csv(filepath, index=False)  # header row only
+
+        check_file_valid_writable(filepath)
+        new_data = pd.DataFrame([[name] + values for name, values in self.context_dict.items()])
+        new_data.to_csv(filepath, mode='a+', header=False, index=False)
+        self.context_dict = defaultdict(list)
+
+    def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False):
+        """Buffer `scalar_value` under the part of `tag` before the first '/'; report rule hits."""
+        new_avg = avg = scalar_value
+        if tag in self.tag2scalars:
+            count = len(self.tag2scalars[tag])
+            _, avg = self.tag2scalars[tag][-1]
+            new_avg = (avg * count + scalar_value) / (count + 1)
+        self.tag2scalars[tag].append((scalar_value, new_avg))
+        detected, rule_name = self._ad(scalar_value, history=avg)
+        if detected:
+            exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}"
+            print_info_log(exception_message)
+            if self.anomaly_inform:
+                self.anomaly_inform.run(exception_message, self.job_id)
+        self.context_dict[tag.split('/')[0]].append(scalar_value)
+
+    def _ad(self, scalar_value, history):
+        return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value)
+
+
+
 class SummaryWriterWithAD(SummaryWriter):
     def __init__(self, path, ad_rules, job_id, anomaly_inform=False):
         super().__init__(path)
diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py
index 8b8826a84d0..129323340b5 100644
--- a/debug/accuracy_tools/kj600/kj600/module_hook.py
+++ b/debug/accuracy_tools/kj600/kj600/module_hook.py
@@ -11,9 +11,9 @@ from kj600.module_spec_verifier import get_config, validate_config_spec
 from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon
 from kj600.features import eff_rank, get_sign_matches
 from kj600.visualizer import HeatmapVisualizer
-from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD
+from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD, CSVWriterWithAD
 from kj600.anomaly_inform import AnomalyInformFactory
-from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics
+from kj600.module_metric import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics
 from kj600.distributed.wrap_distributed import api_register, create_hooks,  op_aggregate
 from kj600.utils import print_warn_log, print_info_log, get_param_struct
 
@@ -83,14 +83,17 @@ class GradContext:
     def __init__(self) -> None:
         self.pre = []
         self.post = []
-        self.grad_acc = {}
+        self.acc_metric = []
+        self.acc = {}
         self.micro_step = -1
 
     def reset(self):
         self.pre.clear()
         self.post.clear()
-        for k,v in self.grad_acc.items():
+        self.acc_metric.clear()
+        for k,v in self.acc.items():
             v.fill_(0.)
+            
 
 
 class TrainerMon:
@@ -107,12 +110,12 @@ class TrainerMon:
         self.params_have_main_grad = params_have_main_grad
         self.config = get_config(config_file_path)
         self.module_rank_list = self.config.get("module_ranks", [])
+        self.format = self.config.get('format', 'tensorboard')
         self.eps = self.config.get('eps', 1e-8)
         self.ops = self.config.get('ops', [])
         self.xy_distribution = self.config.get('xy_distribution', False)
         if not self.xy_distribution:
             print_rank_0("> module input/output input_grad/output_grad is not monitored. ")
-        
         # backward hook cause megatron-lm pipeline parallel schedule assert exception. 
         # TBD: backward hook cause output tensor is view of some base tensor. root cause invesigation pending.
         self.forward_only = self.config.get('forward_only', False) 
@@ -153,12 +156,19 @@ class TrainerMon:
         output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output')
         cur_time = datetime.now().strftime('%b%d_%H-%M-%S')
         unique_id = str(uuid.uuid4())[:8]
+
+        if self.format == 'tensorboard':
+            writer = SummaryWriterWithAD
+            self.write_metrics = write_metrics_tensorboard
+        elif self.format == 'csv':
+            writer = CSVWriterWithAD
+            self.write_metrics = write_metrics_csv
         if dist.is_initialized():
             if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0:
-                self.summary_writer = SummaryWriterWithAD(
+                self.summary_writer = writer(
                     os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform)
         else:
-            self.summary_writer = SummaryWriterWithAD(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform)
+            self.summary_writer = writer(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform)
         # A HeatmapVisualizer instance is associated with an image
         self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer)
         self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer)
@@ -221,13 +231,13 @@ class TrainerMon:
             for name, param in model.named_parameters():
                 print_rank_0(f"\t{name}")
                 for target_module, _ in self.config['targets'].items():
-                    if name.startswith(target_module): # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0
+                    if name.startswith(target_module) and param.requires_grad: # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0
                         self.param_name_list.append(name)
                         self.param2name[param] = name
             self.hook_optimizer()
         return
 
-    def _get_wg_metric(self, tag='pre_grad'):
+    def _get_wg_metric(self, tag):
         grad_dict = {}
         for param, name in self.param2name.items():
             grad = param.main_grad if self.params_have_main_grad else param.grad
@@ -244,28 +254,15 @@ class TrainerMon:
 
     def monitor_gnorm_with_ad(self, model, grad_acc_steps):
         self.hook_optimizer()
+        self.micro_batch_number = grad_acc_steps
+        self.wg_distribution = True
 
         if self.print_struct:
             self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)}
             return
         self._register_param_name(model)
-        
-        self.micro_batch_number = grad_acc_steps
-
-        def model_backward_hook(module, input_grad, output_grad):
-            
-            if self.wg_distribution:
-                self.grad_context.micro_step += 1 ## error if vpp
-                if self.grad_context.micro_step == (self.micro_batch_number - 1):
-                    print('>> pre grad from backward')
-                    self.grad_context.micro_step = -1
-                    wg_metric_dict = self._get_wg_metric(tag='pre_grad')
-                    self.grad_context.pre.append(wg_metric_dict)
-
-        # for model_chunk in model:
-        #     model_chunk.register_full_backward_hook(model_backward_hook)
-
-        self._hook_weights()
+        self._hook_model_for_grad_acc(model)
+        # self._hook_weights()
 
 
     def build_tbtag_tensor_map(self, module_name, tag, tensor):
@@ -305,25 +302,23 @@ class TrainerMon:
         for _, fwd_context in self.module_fwd_hook_context_by_module.items():
             if not len(fwd_context.actv) == self.micro_batch_number:
                 print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}")
-            for metric_name in self.ops:
-                write_metrics_tensorboard(metric_name, self.summary_writer, fwd_context.actv, step)
+            self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv')
             fwd_context.actv.clear()
 
         for _, bwd_context in self.module_bwd_hook_context_by_module.items():
             if not len(bwd_context.actvgrad) == self.micro_batch_number:
                 print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}")
-            for metric_name in self.ops:
-                write_metrics_tensorboard(metric_name, self.summary_writer, bwd_context.actvgrad, step)
+            self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv')
             bwd_context.actvgrad.clear()
 
     def write_grad_tb(self, step):
         if not self.wg_distribution:
             return
 
-        for metric_name in self.ops:
-            # write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.pre, step)
-            write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.post, step)
-            write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.xx, step)
+        self.write_metrics(self.ops, self.summary_writer, self.grad_context.pre, step, 'grad_unreduced')
+        self.write_metrics(self.ops, self.summary_writer, self.grad_context.post, step, 'grad_reduced')
+        if self.weight_hooked:
+            self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_accumulated')
 
         self.grad_context.reset()
 
@@ -347,12 +342,10 @@ class TrainerMon:
             
             if self.wg_distribution:
                 if self.weight_hooked:
-                    print('>>> pre grad from weight hook')
                     metric_dict = {}
                     for metric_name in self.ops:
-                        metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.grad_acc, self.eps)
-                    # self.grad_context.grad.append(metric_dict)
-                    self.grad_context.xx = [metric_dict]
+                        metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.acc, self.eps)
+                    self.grad_context.acc_metric = [metric_dict]
   
                 wg_metric_dict = self._get_wg_metric(tag='post_grad')
                 self.grad_context.post.append(wg_metric_dict)
@@ -388,6 +381,7 @@ class TrainerMon:
                 cc_metrics = self.generate_cc_metrics(k, c)
                 for op, m in cc_metrics.items():
                     metric_dict[op].update(m)
+            
             if not metric_dict:
                 return
             context.metric_list.append(metric_dict)
@@ -407,10 +401,8 @@ class TrainerMon:
                 for param_name, _ in context.param_adam_ratio.items():
                     self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer)
 
-            for metric_name in self.ops:
-                if not context.metric_list:
-                    break
-                write_metrics_tensorboard(metric_name, self.summary_writer, context.metric_list, context.step)
+            if context.metric_list:
+                self.write_metrics(self.ops, self.summary_writer, context.metric_list, context.step, 'other')
             context.metric_list.clear()
             context.step += 1
 
@@ -540,6 +532,21 @@ class TrainerMon:
                 hooked_count += 1
         return hooked_count
 
+    def _hook_model_for_grad_acc(self, model):
+        # Snapshot 'pre_grad' weight-grad metrics once a chunk has run micro_batch_number backward passes.
+        def model_backward_hook(module, input_grad, output_grad):
+            # Count on `module` (the chunk this hook fired for), not the late-bound loop variable.
+            module.micro_step += 1
+            if module.micro_step == self.micro_batch_number:
+                module.micro_step = 0
+                self.grad_context.pre.append(self._get_wg_metric(tag='pre_grad'))
+
+        if not isinstance(model, list):
+            model = [model]
+        for model_chunk in model:
+            model_chunk.micro_step = 0
+            model_chunk.register_full_backward_hook(model_backward_hook)
+
     def _hook_weights(self):
         context = self.grad_context
 
@@ -549,7 +556,7 @@ class TrainerMon:
 
         for param, name in self.param2name.items():
             key = get_summary_writer_tag_name(name, 'acc_grad', self.rank)
-            context.grad_acc[key] = torch.zeros_like(param).to(DEVICE)
-            param.register_hook(partial(param_hook, grad_acc=context.grad_acc[key]))
+            context.acc[key] = torch.zeros_like(param).to(DEVICE)
+            param.register_hook(partial(param_hook, grad_acc=context.acc[key]))
 
         self.weight_hooked = True
\ No newline at end of file
diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py
index e09536b072c..57934ed8199 100644
--- a/debug/accuracy_tools/kj600/kj600/module_metric.py
+++ b/debug/accuracy_tools/kj600/kj600/module_metric.py
@@ -150,9 +150,24 @@ def get_metrics(metric_name, tag2tensor, eps):
         raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e
 
 
-def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step):
-    try:
-        fun_metric = config_metric_registry[metric_name]
-        return fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step)
-    except KeyError as e:
-        raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e
+def write_metrics_tensorboard(ops, summary_writer, metric_value, step, prefix=''):  # `prefix` is accepted but not used in this body
+    for metric_name in ops:
+        try:
+            fun_metric = config_metric_registry[metric_name]
+            fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step)
+        except KeyError as e:  # also catches a KeyError raised inside metric_tensorboard, not only the registry lookup
+            raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e
+
+def write_metrics_csv(ops, summary_writer, metric_value, step, prefix=''):
+    """Run each metric in `ops` against `summary_writer`, then write the rows via `write_csv(prefix, step)`."""
+    for metric_name in ops:
+        try:
+            fun_metric = config_metric_registry[metric_name]
+            fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step)
+        except KeyError as e:
+            raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e
+
+    # The header is set only while it is still empty, from the ops of that call.
+    if not summary_writer.header:
+        summary_writer.header = ['param_name'] + list(ops)
+    summary_writer.write_csv(prefix, step)
\ No newline at end of file
diff --git a/debug/accuracy_tools/kj600/kj600/utils.py b/debug/accuracy_tools/kj600/kj600/utils.py
index 53d47d99886..3aed6911c44 100644
--- a/debug/accuracy_tools/kj600/kj600/utils.py
+++ b/debug/accuracy_tools/kj600/kj600/utils.py
@@ -107,4 +107,29 @@ def check_file_valid_readable(path):
 def check_file_valid_writable(path):
     check_file_valid(path)
     check_path_writability(path)
-    
\ No newline at end of file
+    
+
+def make_file_safety(file_path: str, permission=0o640):  # create an empty file at file_path with mode `permission` unless it already exists
+    if os.path.islink(file_path):  # a symlink at file_path is rejected outright
+        raise RuntimeError(f"Invalid soft link path: {file_path}")
+    file_real_path = os.path.realpath(file_path)
+    if os.path.exists(file_real_path):
+        return  # an existing file is left untouched; its mode is not changed
+    parent_path = os.path.dirname(file_real_path)
+    if not os.path.exists(parent_path):
+        os.makedirs(parent_path, mode=0o750, exist_ok=True)
+    if not os.access(parent_path, os.W_OK):
+        raise PermissionError(f"The path {parent_path} is not writable!")
+    try:
+        os.close(os.open(file_real_path, os.O_WRONLY | os.O_CREAT, permission))  # create the file and close the descriptor right away
+    except OSError as e:
+        raise RuntimeError("Can't create file: " + file_real_path) from e
+    os.chmod(file_real_path, permission)  # os.open's mode is masked by the umask; chmod applies `permission` exactly
+
+
+def create_directory(dir_path):  # create dir_path and any missing parents; an already existing directory is not an error
+    dir_path = os.path.realpath(dir_path)  # work on the canonical path (symlinks resolved)
+    try:
+        os.makedirs(dir_path, mode=0o750, exist_ok=True)
+    except OSError as ex:  # re-raised as RuntimeError with the original error chained
+        raise RuntimeError("Failed to create directory. Please check the path permission or disk space.") from ex
\ No newline at end of file
-- 
Gitee