From 6477f296135bd8d1817b5a1ceaeaa86c551c965e Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 26 Jul 2024 07:25:59 +0000 Subject: [PATCH 1/3] support vpp --- .../accuracy_tools/kj600/kj600/module_hook.py | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index d13736aeb20..6d1b25023e5 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -147,6 +147,7 @@ class TrainerMon: anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None self.optimizer_hooked = False + self.vpp = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] @@ -490,16 +491,33 @@ class TrainerMon: with torch.no_grad(): context.grad_acc += grad + def register_hooks(model_chunk, prefix=''): + for param_name, param in model_chunk.named_parameters(): + name = prefix + param_name + for target in self.config['targets'].keys(): + context = self.grad_context[name] + if param_name.startswith(target) and param.requires_grad: + self._smallest_rank_print(f'>> monitoring: {name}') + self.param2name[param] = name + param.register_hook(partial(param_hook, context=context)) + context.grad_acc = torch.zeros_like(param).to(DEVICE) + + if self.print_struct: self.module_struct = { module_name: 1. for module_name, module in model.named_modules()} return - for name, param in model.named_parameters(): - for target in self.config['targets'].keys(): - context = self.grad_context[name] - if name.startswith(target) and param.requires_grad: - self._smallest_rank_print(f'>> monitoring: {name}') - self.param2name[param] = name - param.register_hook(partial(param_hook, context=context)) - context.grad_acc = torch.zeros_like(param).to(DEVICE) + if isinstance(model, list): + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') + + for vpp_stage, model_chunk in enumerate(model): + prefix = f'{vpp_stage}_' if self.vpp else '' + register_hooks(model_chunk, prefix=prefix) + + else: + register_hooks(model) + + -- Gitee From 7a02fd3732936a65d69aa5fbc71fa2938eafde9c Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 1 Aug 2024 03:11:01 +0000 Subject: [PATCH 2/3] monitor gnorm before reduce by model hook --- .../accuracy_tools/kj600/kj600/module_hook.py | 148 +++++++++++------- 1 file changed, 90 insertions(+), 58 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 6d1b25023e5..8b8826a84d0 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -83,12 +83,14 @@ class GradContext: def __init__(self) -> None: self.pre = [] self.post = [] - self.grad_acc = None + self.grad_acc = {} + self.micro_step = -1 def reset(self): self.pre.clear() self.post.clear() - self.grad_acc.fill_(0.) + for k,v in self.grad_acc.items(): + v.fill_(0.) class TrainerMon: @@ -101,7 +103,7 @@ class TrainerMon: self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext) self.optimizer_context = defaultdict(OptimizerContext) self.cc_context = defaultdict(CommunicationContext) - self.grad_context = defaultdict(GradContext) + self.grad_context = GradContext() self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) self.module_rank_list = self.config.get("module_ranks", []) @@ -160,7 +162,12 @@ class TrainerMon: # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) - self.micro_batch_number = 0 + self.micro_batch_number = 1 + self.step = -1 + self.rank = dist.get_rank() + + self.weight_hooked = False + self.optimizer_hooked = False self.param_name_list = [] self.param2name = defaultdict(str) @@ -220,11 +227,47 @@ class TrainerMon: self.hook_optimizer() return + def _get_wg_metric(self, tag='pre_grad'): + grad_dict = {} + for param, name in self.param2name.items(): + grad = param.main_grad if self.params_have_main_grad else param.grad + if grad is None: + print_warn_log(f"grad is None: {name}, maybe something wrong happened.") + continue + key = get_summary_writer_tag_name(name, tag, self.rank) + grad_dict[key] = grad + metric_dict = {} + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, grad_dict, self.eps) + return metric_dict + + def monitor_gnorm_with_ad(self, model, grad_acc_steps): - self._hook_weights(model) self.hook_optimizer() + + if self.print_struct: + self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)} + return + self._register_param_name(model) + self.micro_batch_number = grad_acc_steps + def model_backward_hook(module, input_grad, output_grad): + + if self.wg_distribution: + self.grad_context.micro_step += 1 ## error if vpp + if self.grad_context.micro_step == (self.micro_batch_number - 1): + print('>> pre grad from backward') + self.grad_context.micro_step = -1 + wg_metric_dict = self._get_wg_metric(tag='pre_grad') + self.grad_context.pre.append(wg_metric_dict) + + # for model_chunk in model: + # model_chunk.register_full_backward_hook(model_backward_hook) + + self._hook_weights() + + def build_tbtag_tensor_map(self, module_name, tag, tensor): metrics = {} rank = dist.get_rank() if dist.is_initialized() else None @@ -277,13 +320,12 @@ class TrainerMon: if not self.wg_distribution: return - for name in self.param2name.values(): - context = self.grad_context[name] - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, context.pre, step) - write_metrics_tensorboard(metric_name, self.summary_writer, context.post, step) + for metric_name in self.ops: + # write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.pre, step) + write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.post, step) + write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.xx, step) - context.reset() + self.grad_context.reset() def hook_optimizer(self): # in DDP by default use params_have_main_grad @@ -303,7 +345,18 @@ class TrainerMon: context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) - rank = dist.get_rank() if dist.is_initialized() else None + if self.wg_distribution: + if self.weight_hooked: + print('>>> pre grad from weight hook') + metric_dict = {} + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.grad_acc, self.eps) + # self.grad_context.grad.append(metric_dict) + self.grad_context.xx = [metric_dict] + + wg_metric_dict = self._get_wg_metric(tag='post_grad') + self.grad_context.post.append(wg_metric_dict) + for param, name in self.param2name.items(): if "params_effrank" in self.config and name in self.config["params_effrank"]: context.param_effective_rank[name] = eff_rank(param.detach()) @@ -311,18 +364,6 @@ class TrainerMon: if grad is None: print_warn_log(f"grad is None: {name}, maybe something wrong happened.") continue - if self.wg_distribution: - metric_dict = {} - key = get_summary_writer_tag_name(name, 'post_grad', rank) - for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, {key: grad}, self.eps) - self.grad_context[name].post.append(metric_dict) - - metric_dict = {} - key = get_summary_writer_tag_name(name, 'pre_grad', rank) - for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, {key: self.grad_context[name].grad_acc}, self.eps) - self.grad_context[name].pre.append(metric_dict) if self.mg_direction: if context.step == 0: @@ -390,6 +431,22 @@ class TrainerMon: else: print_info_log(msg) + def _register_param_name(self, model): + if isinstance(model, list): + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') + + for vpp_stage, model_chunk in enumerate(model): + prefix = f'{vpp_stage}_' if self.vpp else '' + for param_name, param in model_chunk.named_parameters(): + name = prefix + param_name + for target in self.config['targets'].keys(): + if param_name.startswith(target) and param.requires_grad: + self._smallest_rank_print(f'>> monitoring: {name}') + self.param2name[param] = name + + def _hook_module(self, target_names, module: torch.nn.Module, fwd_or_bkd): if '_modules' not in module.__dict__: # nothing to hook @@ -483,41 +540,16 @@ class TrainerMon: hooked_count += 1 return hooked_count - def _hook_weights(self, model): - self.wg_distribution = True - rank = dist.get_rank() if dist.is_initialized() else None + def _hook_weights(self): + context = self.grad_context - def param_hook(grad, context): + def param_hook(grad, grad_acc): with torch.no_grad(): - context.grad_acc += grad - - def register_hooks(model_chunk, prefix=''): - for param_name, param in model_chunk.named_parameters(): - name = prefix + param_name - for target in self.config['targets'].keys(): - context = self.grad_context[name] - if param_name.startswith(target) and param.requires_grad: - self._smallest_rank_print(f'>> monitoring: {name}') - self.param2name[param] = name - param.register_hook(partial(param_hook, context=context)) - context.grad_acc = torch.zeros_like(param).to(DEVICE) + grad_acc += grad + for param, name in self.param2name.items(): + key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) + context.grad_acc[key] = torch.zeros_like(param).to(DEVICE) + param.register_hook(partial(param_hook, grad_acc=context.grad_acc[key])) - if self.print_struct: - self.module_struct = { - module_name: 1. for module_name, module in model.named_modules()} - return - - if isinstance(model, list): - if len(model) > 1: - self.vpp = True - self._smallest_rank_print('vpp enabled') - - for vpp_stage, model_chunk in enumerate(model): - prefix = f'{vpp_stage}_' if self.vpp else '' - register_hooks(model_chunk, prefix=prefix) - - else: - register_hooks(model) - - + self.weight_hooked = True \ No newline at end of file -- Gitee From 133c95ce5a0556828c432b3dfc84834391005255 Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 07:58:46 +0000 Subject: [PATCH 3/3] support csv output --- .../kj600/kj600/anomaly_detect.py | 53 ++++++++++- .../accuracy_tools/kj600/kj600/module_hook.py | 95 ++++++++++--------- .../kj600/kj600/module_metric.py | 27 ++++-- debug/accuracy_tools/kj600/kj600/utils.py | 27 +++++- 4 files changed, 150 insertions(+), 52 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index cbd7b6daa2f..5a98aabb863 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -1,10 +1,13 @@ +import os import statistics as st from abc import ABC from typing import List +import pandas as pd import sys from torch.utils.tensorboard import SummaryWriter from collections import defaultdict -from kj600.utils import print_info_log +from kj600.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory + class ScanRule(ABC): def apply(self, history, cur): @@ -59,6 +62,54 @@ class bcolors: BOLD = '\033[1m' UNDERLINE = '\033[4m' + +class CSVWriterWithAD: + def __init__(self, path, ad_rules, job_id, anomaly_inform=False): + self.path = path + create_directory(path) + self.tag2scalars = defaultdict(list) + self.ad_rules = ad_rules + self.job_id = job_id + self.anomaly_inform = anomaly_inform + self.context_dict = defaultdict(list) + self.header = [] + + def write_csv(self, prefix, step): + if len(self.context_dict) == 0: + return + filepath = os.path.join(self.path, f'{prefix}_{step}.csv') + if not os.path.exists(filepath): + make_file_safety(filepath) + data_frame = pd.DataFrame(columns=self.header) + data_frame.to_csv(filepath, index=False) + + check_file_valid_writable(filepath) + new_data = pd.DataFrame([[name]+metric_value for name, metric_value in self.context_dict.items()]) + new_data.to_csv(filepath, mode='a+', header=False, index=False) + self.context_dict = defaultdict(list) + + def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): + new_avg = avg = scalar_value + if tag in self.tag2scalars: + N = len(self.tag2scalars[tag]) + _, avg = self.tag2scalars[tag][-1] + new_avg = (avg*N + scalar_value)/(N + 1) + self.tag2scalars[tag].append((scalar_value, new_avg)) + detected, rule_name = self._ad(scalar_value, history=avg) + if detected: + print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") + exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" + if self.anomaly_inform: + self.anomaly_inform.run(exception_message, self.job_id) + + name = tag.split('/')[0] + self.context_dict[name].append(scalar_value) + + def _ad(self, scalar_value, history): + return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) + + + class SummaryWriterWithAD(SummaryWriter): def __init__(self, path, ad_rules, job_id, anomaly_inform=False): super().__init__(path) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 8b8826a84d0..129323340b5 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -11,9 +11,9 @@ from kj600.module_spec_verifier import get_config, validate_config_spec from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer -from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD +from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD, CSVWriterWithAD from kj600.anomaly_inform import AnomalyInformFactory -from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics +from kj600.module_metric import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate from kj600.utils import print_warn_log, print_info_log, get_param_struct @@ -83,14 +83,17 @@ class GradContext: def __init__(self) -> None: self.pre = [] self.post = [] - self.grad_acc = {} + self.acc_metric = [] + self.acc = {} self.micro_step = -1 def reset(self): self.pre.clear() self.post.clear() - for k,v in self.grad_acc.items(): + self.acc_metric.clear() + for k,v in self.acc.items(): v.fill_(0.) + class TrainerMon: @@ -107,12 +110,12 @@ class TrainerMon: self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) self.module_rank_list = self.config.get("module_ranks", []) + self.format = self.config.get('format', 'tensorboard') self.eps = self.config.get('eps', 1e-8) self.ops = self.config.get('ops', []) self.xy_distribution = self.config.get('xy_distribution', False) if not self.xy_distribution: print_rank_0("> module input/output input_grad/output_grad is not monitored. ") - # backward hook cause megatron-lm pipeline parallel schedule assert exception. # TBD: backward hook cause output tensor is view of some base tensor. root cause invesigation pending. self.forward_only = self.config.get('forward_only', False) @@ -153,12 +156,19 @@ class TrainerMon: output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] + + if self.format == 'tensorboard': + writer = SummaryWriterWithAD + self.write_metrics = write_metrics_tensorboard + elif self.format == 'csv': + writer = CSVWriterWithAD + self.write_metrics = write_metrics_csv if dist.is_initialized(): if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0: - self.summary_writer = SummaryWriterWithAD( + self.summary_writer = writer( os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) else: - self.summary_writer = SummaryWriterWithAD(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) + self.summary_writer = writer(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) @@ -221,13 +231,13 @@ class TrainerMon: for name, param in model.named_parameters(): print_rank_0(f"\t{name}") for target_module, _ in self.config['targets'].items(): - if name.startswith(target_module): # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 + if name.startswith(target_module) and param.requires_grad: # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 self.param_name_list.append(name) self.param2name[param] = name self.hook_optimizer() return - def _get_wg_metric(self, tag='pre_grad'): + def _get_wg_metric(self, tag): grad_dict = {} for param, name in self.param2name.items(): grad = param.main_grad if self.params_have_main_grad else param.grad @@ -244,28 +254,15 @@ class TrainerMon: def monitor_gnorm_with_ad(self, model, grad_acc_steps): self.hook_optimizer() + self.micro_batch_number = grad_acc_steps + self.wg_distribution = True if self.print_struct: self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)} return self._register_param_name(model) - - self.micro_batch_number = grad_acc_steps - - def model_backward_hook(module, input_grad, output_grad): - - if self.wg_distribution: - self.grad_context.micro_step += 1 ## error if vpp - if self.grad_context.micro_step == (self.micro_batch_number - 1): - print('>> pre grad from backward') - self.grad_context.micro_step = -1 - wg_metric_dict = self._get_wg_metric(tag='pre_grad') - self.grad_context.pre.append(wg_metric_dict) - - # for model_chunk in model: - # model_chunk.register_full_backward_hook(model_backward_hook) - - self._hook_weights() + self._hook_model_for_grad_acc(model) + # self._hook_weights() def build_tbtag_tensor_map(self, module_name, tag, tensor): @@ -305,25 +302,23 @@ class TrainerMon: for _, fwd_context in self.module_fwd_hook_context_by_module.items(): if not len(fwd_context.actv) == self.micro_batch_number: print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, fwd_context.actv, step) + self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv') fwd_context.actv.clear() for _, bwd_context in self.module_bwd_hook_context_by_module.items(): if not len(bwd_context.actvgrad) == self.micro_batch_number: print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, bwd_context.actvgrad, step) + self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') bwd_context.actvgrad.clear() def write_grad_tb(self, step): if not self.wg_distribution: return - for metric_name in self.ops: - # write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.pre, step) - write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.post, step) - write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.xx, step) + self.write_metrics(self.ops, self.summary_writer, self.grad_context.pre, step, 'grad_unreduced') + self.write_metrics(self.ops, self.summary_writer, self.grad_context.post, step, 'grad_reduced') + if self.weight_hooked: + self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_accumulated') self.grad_context.reset() @@ -347,12 +342,10 @@ class TrainerMon: if self.wg_distribution: if self.weight_hooked: - print('>>> pre grad from weight hook') metric_dict = {} for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.grad_acc, self.eps) - # self.grad_context.grad.append(metric_dict) - self.grad_context.xx = [metric_dict] + metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.acc, self.eps) + self.grad_context.acc_metric = [metric_dict] wg_metric_dict = self._get_wg_metric(tag='post_grad') self.grad_context.post.append(wg_metric_dict) @@ -388,6 +381,7 @@ class TrainerMon: cc_metrics = self.generate_cc_metrics(k, c) for op, m in cc_metrics.items(): metric_dict[op].update(m) + if not metric_dict: return context.metric_list.append(metric_dict) @@ -407,10 +401,8 @@ class TrainerMon: for param_name, _ in context.param_adam_ratio.items(): self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer) - for metric_name in self.ops: - if not context.metric_list: - break - write_metrics_tensorboard(metric_name, self.summary_writer, context.metric_list, context.step) + if context.metric_list: + self.write_metrics(self.ops, self.summary_writer, context.metric_list, context.step, 'other') context.metric_list.clear() context.step += 1 @@ -540,6 +532,21 @@ class TrainerMon: hooked_count += 1 return hooked_count + def _hook_model_for_grad_acc(self, model): + def model_backward_hook(module, input_grad, output_grad): + model_chunk.micro_step += 1 ## error if vpp + if model_chunk.micro_step == (self.micro_batch_number): + model_chunk.micro_step = 0 + wg_metric_dict = self._get_wg_metric(tag='pre_grad') + self.grad_context.pre.append(wg_metric_dict) + + if not isinstance(model, list): + model = [model] + + for model_chunk in model: + setattr(model_chunk,'micro_step', 0) + model_chunk.register_full_backward_hook(model_backward_hook) + def _hook_weights(self): context = self.grad_context @@ -549,7 +556,7 @@ class TrainerMon: for param, name in self.param2name.items(): key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) - context.grad_acc[key] = torch.zeros_like(param).to(DEVICE) - param.register_hook(partial(param_hook, grad_acc=context.grad_acc[key])) + context.acc[key] = torch.zeros_like(param).to(DEVICE) + param.register_hook(partial(param_hook, grad_acc=context.acc[key])) self.weight_hooked = True \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index e09536b072c..57934ed8199 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -150,9 +150,24 @@ def get_metrics(metric_name, tag2tensor, eps): raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e -def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step): - try: - fun_metric = config_metric_registry[metric_name] - return fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) - except KeyError as e: - raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e +def write_metrics_tensorboard(ops, summary_writer, metric_value, step, prefix=''): + for metric_name in ops: + try: + fun_metric = config_metric_registry[metric_name] + fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) + except KeyError as e: + raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e + +def write_metrics_csv(ops, summary_writer, metric_value, step, preifx=''): + for metric_name in ops: + try: + fun_metric = config_metric_registry[metric_name] + fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) + + except KeyError as e: + print(e) + raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e + + if not summary_writer.header: + summary_writer.header = ['param_name'] + ops + summary_writer.write_csv(preifx, step) \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/utils.py b/debug/accuracy_tools/kj600/kj600/utils.py index 53d47d99886..3aed6911c44 100644 --- a/debug/accuracy_tools/kj600/kj600/utils.py +++ b/debug/accuracy_tools/kj600/kj600/utils.py @@ -107,4 +107,29 @@ def check_file_valid_readable(path): def check_file_valid_writable(path): check_file_valid(path) check_path_writability(path) - \ No newline at end of file + + +def make_file_safety(file_path: str, permission=0o640): + if os.path.islink(file_path): + raise RuntimeError(f"Invalid soft link path: {file_path}") + file_real_path = os.path.realpath(file_path) + if os.path.exists(file_real_path): + return + parent_path = os.path.dirname(file_real_path) + if not os.path.exists(parent_path): + os.makedirs(parent_path, mode=0o750, exist_ok=True) + if not os.access(parent_path, os.W_OK): + raise PermissionError(f"The path {parent_path} is not writable!") + try: + os.close(os.open(file_real_path, os.O_WRONLY | os.O_CREAT, permission)) + except OSError as e: + raise RuntimeError("Can't create file: " + file_real_path) from e + os.chmod(file_real_path, permission) + + +def create_directory(dir_path): + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=0o750, exist_ok=True) + except OSError as ex: + raise RuntimeError("Failed to create directory. Please check the path permission or disk space.") from ex \ No newline at end of file -- Gitee