diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/anomaly_detect.py b/debug/accuracy_tools/msprobe/mindspore/monitor/anomaly_detect.py index 3544ebbd025614349585bc799b15e00a5c2c7956..9eed28575f7a843f71050221ac59ff9df65bcb4d 100644 --- a/debug/accuracy_tools/msprobe/mindspore/monitor/anomaly_detect.py +++ b/debug/accuracy_tools/msprobe/mindspore/monitor/anomaly_detect.py @@ -162,9 +162,8 @@ class TrainStage: OPTIMIZER_STAGE = 2 -FORWARD_KEY = [MonitorConst.ACTV_IN, MonitorConst.ACTV_OUT] -BACKWARD_KEY = [MonitorConst.ACTVGRAD_IN, MonitorConst.ACTVGRAD_OUT, - MonitorConst.PRE_GRAD, MonitorConst.POST_GRAD, MonitorConst.ACC_GRAD] +FORWARD_KEY = [MonitorConst.ACTV] +BACKWARD_KEY = [MonitorConst.ACTVGRAD, MonitorConst.PRE_GRAD, MonitorConst.POST_GRAD, MonitorConst.ACC_GRAD] OPTIMIZER_KEY = [MonitorConst.EXP_AVG, MonitorConst.EXP_AVG_SQ] TRAIN_STAGE = { **{key_: TrainStage.FORWARD_STAGE for key_ in FORWARD_KEY}, @@ -222,7 +221,7 @@ class GradAnomalyData: @staticmethod def get_train_stage(tag_name): """ - :param tag_name: "0:fc2_0/rank0/input", "0:fc1.weight/rank0/post_grad", "0:fc2.weight/rank0/exp_avg_sq" + :param tag_name: "0:fc2.input:0/rank0/actv", "0:fc1.weight/rank0/post_grad", "0:fc2.weight/rank0/exp_avg_sq" :return: int, if forward return 0; if backward return 1; if optimizer return 2 """ key_ = tag_name.split("/")[-1] @@ -255,6 +254,45 @@ class BaseWriterWithAD: self.anomalies = [] self.ndigits = writer_input.ndigits + @staticmethod + def stack_tensors(tensor_list): + """ + Torch not support stack cpu and xpu tensors. Group the tensors into cpu_group and xpu_group, + stack them separately, migrate xpu_group to cpu, and then restore in the order of input. + + :param tensor_list: [tensor(-1.6165), tensor(-1.0985), tensor(-1.7777), tensor(-1.8408, device='npu:0')] + :return: tensor: tensor([-1.6165, -1.0985, -1.7777, -1.8408], device='cpu') + """ + cpu_tensors = [] + xpu_tensors = [] + + # 将张量分别放入cpu_tensors和xpu_tensors列表 + for tensor in tensor_list: + if tensor.device.type == 'cpu': + cpu_tensors.append(tensor) + else: + xpu_tensors.append(tensor) + + # 分别堆叠cpu_tensors和xpu_tensors + cpu_stack = ops.stack(cpu_tensors) if cpu_tensors else ops.tensor([]) + xpu_stack = ops.stack(xpu_tensors).tolist() if xpu_tensors else ops.tensor([]) + + # 按照输入的顺序恢复 + result = [] + cpu_tensors_idx, xpu_tensors_idx = 0, 0 + for tensor in tensor_list: + if tensor.device.type == 'cpu': + result.append(cpu_stack[cpu_tensors_idx]) + cpu_tensors_idx += 1 + else: + result.append(xpu_stack[xpu_tensors_idx]) + xpu_tensors_idx += 1 + + # 将结果堆叠成一个张量 + result = ops.stack(result) + + return result + def get_anomalies(self): """返回已检测到的异常列表 """ @@ -290,8 +328,12 @@ class BaseWriterWithAD: tags = list(itertools.product(metric_value.keys(), op_list)) for op2tensor in metric_value.values(): tensors.extend(op2tensor.values()) + + if not tensors: + return + with _no_grad(): - metric_list = ops.stack(tensors).tolist() if tensors else [] + metric_list = self.stack_tensors(tensors) for tag, metric in zip(tags, metric_list): self.add_scalar(tag, metric, step, need_explain) @@ -353,10 +395,9 @@ class CSVWriterWithAD(BaseWriterWithAD): new_data = [] for name, metric_value in self.context_dict.items(): - if MonitorConst.NAME_SEP not in name: - new_data.append([name] + [step] + metric_value) - else: - new_data.append(name.split(MonitorConst.NAME_SEP) + [step] + metric_value) + new_line = name.split(MonitorConst.NAME_SEP) + metric_value + new_line.insert(2, step) + new_data.append(new_line) new_data = 
pd.DataFrame(new_data).round(self.ndigits) write_df_to_csv(new_data, filepath, mode='a+', header=False) self.context_dict = defaultdict(list) @@ -379,26 +420,11 @@ class CSVWriterWithAD(BaseWriterWithAD): need_explain = prefix == 'other' super().write_metrics(op_list, metric_value, step, prefix='', need_explain=need_explain) - # generate csv headers - # set hashmap to reduce the number of headers generated. - # 前向的norm用input.ops_和output.ops_,反向的用input_grad.ops_和output_grad.ops_ - if prefix in {"actv", "actv_grad"}: - if prefix == "actv": - input_and_output = [MonitorConst.ACTV_IN, MonitorConst.ACTV_OUT] - else: - input_and_output = [MonitorConst.ACTVGRAD_IN, MonitorConst.ACTVGRAD_OUT] - ops_ = [MonitorConst.DOT.join(i) for i in itertools.product(input_and_output, op_list)] - csv_header = ["module_name", "step", *ops_] + if prefix in [MonitorConst.ACTV, MonitorConst.ACTVGRAD]: + self.header = MonitorConst.CSV_HEADER_XY + ops else: - csv_header = ["param_name", "step", *op_list] - - keys = list(metric_value.keys()) - if keys and MonitorConst.NAME_SEP in keys[0]: - csv_header.insert(0, "vpp_stage") - - self.header = csv_header + self.header = MonitorConst.CSV_HEADER + ops self.write_csv(prefix, step) - self.header = [] def close(self): pass diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py index ffda6d6202eb98b38a9efe6a43d1357c714646e1..5e1269ac9c5bb2b6e2021cdeb1fa3a9c097eceb1 100644 --- a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py +++ b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py @@ -25,12 +25,12 @@ from mindspore import Tensor, mint from mindspore import nn, _no_grad from msprobe.core.common.log import logger -from msprobe.core.common.const import MonitorConst +from msprobe.core.common.const import MonitorConst, Const from msprobe.core.common.file_utils import load_json, save_json from msprobe.mindspore.common.utils import is_mindtorch from msprobe.mindspore.monitor.common_func import is_valid_instance, get_parameters, get_submodules, get_rank from msprobe.mindspore.monitor.utils import get_summary_writer_tag_name, validate_config, step_accumulates_one, \ - is_skip_step, get_metrics, get_single_metrics, get_target_output_dir + is_skip_step, get_metrics, get_target_output_dir from msprobe.mindspore.monitor.module_spec_verifier import validate_config_spec from msprobe.mindspore.monitor.optimizer_collect import OptimizerMonFactory from msprobe.mindspore.monitor.anomaly_detect import AnomalyScanner, AnomalyDataFactory, \ @@ -576,21 +576,22 @@ class TrainerMon: for _, fwd_context in self.module_fwd_hook_context_by_module.items(): if len(fwd_context.actv) == 0: continue - self.summary_writer.write_metrics(self.ops, fwd_context.actv, step, 'actv') + self.summary_writer.write_metrics(self.ops, fwd_context.actv, step, MonitorConst.ACTV) fwd_context.actv.clear() if self.grad_context.actv: - self.summary_writer.write_metrics(self.ops, self.grad_context.actv, step, 'actv_grad') + self.summary_writer.write_metrics(self.ops, self.grad_context.actv, step, MonitorConst.ACTVGRAD) def write_param_tb(self, opt_context): if not self.param_distribution: return - self.summary_writer.write_metrics(self.ops, opt_context.param_metric, opt_context.step, 'param') + self.summary_writer.write_metrics(self.ops, opt_context.param_metric, opt_context.step, MonitorConst.PARAM) def write_mv_tb(self, opt_context): if not self.mv_distribution: return - self.summary_writer.write_metrics(self.ops, 
opt_context.exp_avg_metric, opt_context.step, 'exp_avg') - self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric, opt_context.step, 'exp_avg_sq') + self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_metric, opt_context.step, MonitorConst.EXP_AVG) + self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric, opt_context.step, + MonitorConst.EXP_AVG_SQ) def write_grad_tb(self, step): if not self.wg_distribution: @@ -604,13 +605,28 @@ class TrainerMon: return False return True - def build_tbtag_tensor_map(self, module_name, tag, tensor): - metrics = {} - key = get_summary_writer_tag_name(module_name, tag, str(self.rank)) + def build_tbtag_tensor_map(self, module_name, suffix, tag, tensor): + """ + :param module_name: str of module name + :param suffix: + :param tag: + :param tensor: torch.tensor or tuple/list of torch.tensor + :return: tensor_map + """ + tensor_map = {} if isinstance(tensor, Tensor): - self.register_param_call_id("_hook_module", key) - metrics[key] = tensor - return metrics + tensor = [tensor] + if isinstance(tensor, tuple) or isinstance(tensor, list): + if len(tensor) == 1: + key = get_summary_writer_tag_name(module_name + suffix, tag, self.rank) + self.register_param_call_id("_hook_module", key) + tensor_map[key] = tensor[0] + else: + for i, tensor_i in enumerate(tensor): + key = get_summary_writer_tag_name(module_name + f"_{i}" + suffix, tag, self.rank) + self.register_param_call_id("_hook_module", key) + tensor_map[key] = tensor_i + return tensor_map def register_param_call_id(self, hook_name: str, key: str): """ @@ -655,14 +671,20 @@ class TrainerMon: # nothing to hook return 0 - def fwd_hook_fun(module, module_input, module_output, name): + def fwd_hook_fun(module, args, kwargs, module_output, name): + + module_input = [tensor for tensor in args if isinstance(tensor, Tensor)] + if kwargs: + kwargs_tensors = [tensor for tensor in kwargs.values() if isinstance(tensor, Tensor)] + module_input.extend(kwargs_tensors) + if module not in self.module_fwd_hook_context_by_module: self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name) context: ModuleHookContext = self.module_fwd_hook_context_by_module[module] if not context.struct: context.struct = { - MonitorConst.ACTV_IN: get_param_struct(module_input), - MonitorConst.ACTV_OUT: get_param_struct(module_output) + Const.INPUT: get_param_struct(module_input), + Const.OUTPUT: get_param_struct(module_output) } if self.print_struct: self.module_struct[context.module_name].update(context.struct) @@ -673,31 +695,16 @@ class TrainerMon: self.collect_times): step_accumulates_one(context, self.micro_batch_number) return - if not context.format_by_arg: - context.set_format_by_arg(MonitorConst.ACTV_IN, self.targets) - context.set_format_by_arg(MonitorConst.ACTV_OUT, self.targets) - if not context.format_by_arg: - return - if not context.verified: - if not context.ignore_in: - context.focused_in_col = validate_config_spec(context.format_by_arg[MonitorConst.ACTV_IN], - module_input, context.module_name, - MonitorConst.ACTV_IN) - context.focused_out_col = validate_config_spec(context.format_by_arg[MonitorConst.ACTV_OUT], - module_output, context.module_name, - MonitorConst.ACTV_OUT) - context.verified = True tbtag_tensor_map = {} - if not context.ignore_in: - cared_input = module_input if context.focused_in_col is None else module_input[context.focused_in_col] - tbtag_tensor_map.update( - self.build_tbtag_tensor_map(f'{context.module_name}_{context.micro_step}', 
MonitorConst.ACTV_IN, - cared_input)) - cared_output = module_output if context.focused_out_col is None else module_output[context.focused_out_col] tbtag_tensor_map.update( - self.build_tbtag_tensor_map(f'{context.module_name}_{context.micro_step}', MonitorConst.ACTV_OUT, - cared_output)) + self.build_tbtag_tensor_map( + f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, module_input)) + tbtag_tensor_map.update( + self.build_tbtag_tensor_map( + f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, module_output)) try: get_metrics(self.ops, tbtag_tensor_map, self.eps, context.actv) except Exception as e: @@ -722,31 +729,16 @@ class TrainerMon: step_accumulates_one(context, self.micro_batch_number) return - if not context.format_by_arg: - context.set_format_by_arg(MonitorConst.ACTVGRAD_IN, self.targets) - context.set_format_by_arg(MonitorConst.ACTVGRAD_OUT, self.targets) - if not context.format_by_arg: - return - if not context.verified: - if not context.ignore_in: - context.focused_in_col = validate_config_spec(context.format_by_arg[MonitorConst.ACTVGRAD_IN], - input_grad, context.module_name, - MonitorConst.ACTVGRAD_IN) - context.focused_out_col = validate_config_spec(context.format_by_arg[MonitorConst.ACTVGRAD_OUT], - output_grad, context.module_name, - MonitorConst.ACTVGRAD_OUT) - context.verified = True - tbtag_tensor_map = {} - if not context.ignore_in: - cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col] - tbtag_tensor_map.update( - self.build_tbtag_tensor_map( - f'{context.module_name}_{context.micro_step}', MonitorConst.ACTVGRAD_IN, cared_input_grad)) - cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col] tbtag_tensor_map.update( - self.build_tbtag_tensor_map(f'{context.module_name}_{context.micro_step}', MonitorConst.ACTVGRAD_OUT, - cared_output_grad)) + self.build_tbtag_tensor_map( + f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, input_grad)) + + tbtag_tensor_map.update( + self.build_tbtag_tensor_map( + f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, output_grad)) if context.micro_step == 0 and context.actvgrad: logger.warning(f"actvgrad context of {context.module_name} is not empty when first micro_step, " @@ -761,8 +753,8 @@ class TrainerMon: return def fwd_hook_fun_wrapper(fwd_hook_fun, name): - def wrapper(module, module_input, module_output): - return fwd_hook_fun(module, module_input, module_output, name) + def wrapper(module, args, kwargs, module_output): + return fwd_hook_fun(module, args, kwargs, module_output, name) return wrapper if self.backward_only and self.forward_only: @@ -774,7 +766,8 @@ class TrainerMon: if not name: continue if not self.backward_only: - handle = submodule.register_forward_hook(fwd_hook_fun_wrapper(fwd_hook_fun, name=name)) + handle = submodule.register_forward_hook(fwd_hook_fun_wrapper(fwd_hook_fun, name=name), + with_kwargs=True) self.handles['xy'].append(handle) if not self.forward_only: handle = submodule.register_backward_hook(bwd_hook_fun) diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/anomaly_detect.py b/debug/accuracy_tools/msprobe/pytorch/monitor/anomaly_detect.py index 63f20b1928c80e1e29d7cb8224f267c246fcaa8b..64b1ca5ea5ce87e853d4a5384da3a51208c60bfd 100644 --- 
a/debug/accuracy_tools/msprobe/pytorch/monitor/anomaly_detect.py +++ b/debug/accuracy_tools/msprobe/pytorch/monitor/anomaly_detect.py @@ -254,6 +254,45 @@ class BaseWriterWithAD: self.anomalies = [] self.ndigits = writer_input.ndigits + @staticmethod + def stack_tensors(tensor_list): + """ + Torch not support stack cpu and xpu tensors. Group the tensors into cpu_group and xpu_group, + stack them separately, migrate xpu_group to cpu, and then restore in the order of input. + + :param tensor_list: [tensor(-1.6165), tensor(-1.0985), tensor(-1.7777), tensor(-1.8408, device='npu:0')] + :return: tensor: tensor([-1.6165, -1.0985, -1.7777, -1.8408], device='cpu') + """ + cpu_tensors = [] + xpu_tensors = [] + + # 将张量分别放入cpu_tensors和xpu_tensors列表 + for tensor in tensor_list: + if tensor.device.type == 'cpu': + cpu_tensors.append(tensor) + else: + xpu_tensors.append(tensor) + + # 分别堆叠cpu_tensors和xpu_tensors + cpu_stack = torch.stack(cpu_tensors) if cpu_tensors else torch.tensor([]) + xpu_stack = torch.stack(xpu_tensors).cpu() if xpu_tensors else torch.tensor([]) + + # 按照输入的顺序恢复 + result = [] + cpu_tensors_idx, xpu_tensors_idx = 0, 0 + for tensor in tensor_list: + if tensor.device.type == 'cpu': + result.append(cpu_stack[cpu_tensors_idx]) + cpu_tensors_idx += 1 + else: + result.append(xpu_stack[xpu_tensors_idx]) + xpu_tensors_idx += 1 + + # 将结果堆叠成一个张量 + result = torch.stack(result) + + return result + def get_anomalies(self): """返回已检测到的异常列表 """ @@ -299,7 +338,7 @@ class BaseWriterWithAD: end = (i+1) * MonitorConst.SLICE_SIZE if begin == len(tensors): continue - metric_list = torch.stack(tensors[begin:end]).cpu() + metric_list = self.stack_tensors(tensors[begin:end]) for tag, metric in zip(tags[begin:end], metric_list): self.add_scalar(tag, metric, step) diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/csv2tb.py b/debug/accuracy_tools/msprobe/pytorch/monitor/csv2tb.py index 7fbcac84efb38814e01c2cc3cf5b3696a0c1afd2..aad5ba73f1f47cc5fc91df902a6b1e6930db76b7 100644 --- a/debug/accuracy_tools/msprobe/pytorch/monitor/csv2tb.py +++ b/debug/accuracy_tools/msprobe/pytorch/monitor/csv2tb.py @@ -15,6 +15,7 @@ import datetime import os import re +import time from multiprocessing import Process import pytz @@ -113,6 +114,7 @@ def csv2tb_by_step_work(target_output_dirs, output_dirpath, data_type_list): all_step_result = update_dict(all_step_result, parse_step_result) if all_step_result: write_step(output_dirpath, all_step_result, rank, data_type) + time.sleep(1) def check_process_num(process_num): diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py index cbc6f58fef69127ef577878422a157bda267b774..0efbdfd2454e8459a70db35c4f24c9e62e50ba4d 100644 --- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py +++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py @@ -440,10 +440,28 @@ class TrainerMon: return self.tensor_metrics.stat_insert(target_tensor, ops_list, module_name, tensor_name, rank) - def build_tbtag_tensor_map(self, module_name, tag, tensor): - key = get_summary_writer_tag_name(module_name, tag, self.rank) - self.register_param_call_id("_hook_module", key) - return {key: tensor} + def build_tbtag_tensor_map(self, module_name, suffix, tag, tensor): + """ + :param module_name: str of module name + :param suffix: + :param tag: + :param tensor: torch.tensor or tuple/list of torch.tensor + :return: tensor_map + """ + tensor_map = {} + if isinstance(tensor, torch.Tensor): + tensor = [tensor] + if 
isinstance(tensor, tuple) or isinstance(tensor, list): + if len(tensor) == 1: + key = get_summary_writer_tag_name(module_name + suffix, tag, self.rank) + self.register_param_call_id("_hook_module", key) + tensor_map[key] = tensor[0] + else: + for i, tensor_i in enumerate(tensor): + key = get_summary_writer_tag_name(module_name + f"_{i}" + suffix, tag, self.rank) + self.register_param_call_id("_hook_module", key) + tensor_map[key] = tensor_i + return tensor_map def generate_param_map(self, tag, param_tensor): metrics = {} @@ -920,11 +938,17 @@ class TrainerMon: # nothing to hook return 0 - def fwd_hook_fun(module, module_input, module_output, name): + def fwd_hook_fun(module, args, kwargs, module_output, name): if not module.training or is_recomputation(): # 1 only monitor training stage. # 2 when open recompute, skip recomputed forward stage. return + + module_input = [tensor for tensor in args if torch.is_tensor(tensor)] + if kwargs: + kwargs_tensors = [tensor for tensor in kwargs.values() if torch.is_tensor(tensor)] + module_input.extend(kwargs_tensors) + if module not in self.module_fwd_hook_context_by_module: self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name) context: ModuleHookContext = self.module_fwd_hook_context_by_module[module] @@ -936,31 +960,16 @@ class TrainerMon: if self.print_struct: self.module_struct[context.module_name].update(context.struct) return - if not context.format_by_arg: - context.set_format_by_arg(Const.INPUT, self.config['targets']) - context.set_format_by_arg(Const.OUTPUT, self.config['targets']) - if not context.format_by_arg: - return - if not context.verified: - context.focused_in_col = validate_config_spec(context.format_by_arg[Const.INPUT], - module_input, context.module_name, - Const.INPUT) - context.focused_out_col = validate_config_spec(context.format_by_arg[Const.OUTPUT], - module_output, context.module_name, - Const.OUTPUT) - context.verified = True - # expect output be tensor type + tbtag_tensor_map = {} - cared_input = module_input if context.focused_in_col is None else module_input[context.focused_in_col] tbtag_tensor_map.update( self.build_tbtag_tensor_map( - f'{context.module_name}.{Const.INPUT}{MonitorConst.NAME_SEP}{context.micro_step}', - MonitorConst.ACTV, cared_input)) - cared_output = module_output if context.focused_out_col is None else module_output[context.focused_out_col] + f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, module_input)) tbtag_tensor_map.update( self.build_tbtag_tensor_map( - f'{context.module_name}.{Const.OUTPUT}{MonitorConst.NAME_SEP}{context.micro_step}', - MonitorConst.ACTV, cared_output)) + f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, module_output)) get_metrics(self.ops, tbtag_tensor_map, self.eps, context.actv) context.micro_step += 1 @@ -978,31 +987,17 @@ class TrainerMon: if self.print_struct: self.module_struct[context.module_name].update(context.struct) return - if not context.format_by_arg: - context.set_format_by_arg(MonitorConst.INPUT_GRAD, self.config['targets']) - context.set_format_by_arg(MonitorConst.OUTPUT_GRAD, self.config['targets']) - if not context.format_by_arg: - return - if not context.verified: - context.focused_in_col = validate_config_spec( - context.format_by_arg[MonitorConst.INPUT_GRAD], - input_grad, context.module_name, MonitorConst.INPUT_GRAD) - context.focused_out_col = validate_config_spec( - 
context.format_by_arg[MonitorConst.OUTPUT_GRAD], - output_grad, context.module_name, MonitorConst.OUTPUT_GRAD) - context.verified = True tbtag_tensor_map = {} - cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col] tbtag_tensor_map.update( self.build_tbtag_tensor_map( - f'{context.module_name}.{Const.INPUT}{MonitorConst.NAME_SEP}{context.micro_step}', - MonitorConst.ACTV, cared_input_grad)) - cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col] + f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, input_grad)) + tbtag_tensor_map.update( self.build_tbtag_tensor_map( - f'{context.module_name}.{Const.OUTPUT}{MonitorConst.NAME_SEP}{context.micro_step}', - MonitorConst.ACTV, cared_output_grad)) + f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}', + MonitorConst.ACTV, output_grad)) if context.micro_step == 0 and context.actvgrad: logger.warning(f"actvgrad context of {context.module_name} is not empty when first micro_step, " @@ -1028,7 +1023,7 @@ class TrainerMon: if submodule.__class__.__name__ == "FullyShardedDataParallel": continue if not self.backward_only: - handle = submodule.register_forward_hook(partial(fwd_hook_fun, name=name)) + handle = submodule.register_forward_hook(partial(fwd_hook_fun, name=name), with_kwargs=True) self.handles['xy'].append(handle) if not self.forward_only and not self.has_register_backward_hook(name, submodule): handle = submodule.register_full_backward_hook(bwd_hook_fun) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/monitor/test_csv2tb.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/monitor/test_csv2tb.py index 4178e2ef8fbfb2c2bafa90b32fa92d622b95e3cd..26bec0abdd88bfa10f5cdf9aa7271de57de66a8c 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/monitor/test_csv2tb.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/monitor/test_csv2tb.py @@ -17,7 +17,6 @@ import os import shutil import random import unittest -import pytest import torch import numpy as np import torch.nn as nn @@ -30,13 +29,9 @@ from msprobe.pytorch.hook_module.api_register import get_api_register get_api_register().restore_all_api() - base_dir = os.path.dirname(os.path.realpath(__file__)) config_json_path = os.path.join(base_dir, "config", "all_config.json") monitor_output = os.path.join(base_dir, "./monitor_output_csv2tb") -os.environ[MonitorConst.MONITOR_OUTPUT_DIR] = monitor_output -timestamp_dirpath = None -csv2tb_dirpath = None def seed_all(seed=1234, mode=False): @@ -46,8 +41,8 @@ def seed_all(seed=1234, mode=False): torch.manual_seed(seed) torch.use_deterministic_algorithms(mode) -seed_all() +seed_all() inputs = [torch.rand(10, 10) for _ in range(10)] labels = [torch.randint(0, 5, (10,)) for _ in range(10)] @@ -65,31 +60,6 @@ class MockModule(nn.Module): return x2 -def data_collect(): - loss_fun = nn.CrossEntropyLoss() - test_module = MockModule() - nn.init.constant_(test_module.linear.weight, 1.0) - nn.init.constant_(test_module.linear.bias, 1.0) - optimizer = torch.optim.Adam(test_module.parameters()) - - monitor = TrainerMon(config_json_path, params_have_main_grad=False) - monitor.set_monitor(test_module, grad_acc_steps=1, optimizer=optimizer) - - for input_data, label in zip(inputs, labels): - output = test_module(input_data) - loss = loss_fun(output, label) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - global timestamp_dirpath, csv2tb_dirpath - 
timestamp_dirpath = os.path.join(monitor_output, os.listdir(monitor_output)[0]) - csv2tensorboard_by_step(monitor_output) - for dirname in os.listdir(monitor_output): - if "csv2tensorboard" in dirname: - csv2tb_dirpath = os.path.join(monitor_output, dirname, "rank0") - - def extract_scalars_from_tensorboard(log_dir): # 初始化 EventAccumulator event_acc = EventAccumulator(log_dir) @@ -144,97 +114,102 @@ def compare_scalar_dicts(dict1, dict2): return True -@pytest.fixture(scope="session") -def setup_all(): - data_collect() - yield - shutil.rmtree(monitor_output) - -@pytest.mark.usefixtures("setup_all") class TestGradMonitor(unittest.TestCase): + timestamp_dirpath = None + csv2tb_dirpath = None + + @classmethod + def setUpClass(cls): + + os.environ[MonitorConst.MONITOR_OUTPUT_DIR] = monitor_output + if os.path.exists(monitor_output): + shutil.rmtree(monitor_output) + + loss_fun = nn.CrossEntropyLoss() + test_module = MockModule() + nn.init.constant_(test_module.linear.weight, 1.0) + nn.init.constant_(test_module.linear.bias, 1.0) + optimizer = torch.optim.Adam(test_module.parameters()) + + monitor = TrainerMon(config_json_path, params_have_main_grad=False) + monitor.set_monitor(test_module, grad_acc_steps=1, optimizer=optimizer) + + for input_data, label in zip(inputs, labels): + output = test_module(input_data) + loss = loss_fun(output, label) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + cls.timestamp_dirpath = os.path.join(monitor_output, os.listdir(monitor_output)[0]) + csv2tensorboard_by_step(monitor_output) + for dirname in os.listdir(monitor_output): + if "csv2tensorboard" in dirname: + cls.csv2tb_dirpath = os.path.join(monitor_output, dirname, "rank0") + os.environ.pop(MonitorConst.MONITOR_OUTPUT_DIR) def setUp(self): self.maxDiff = None - + def test_actv(self): - data = parse_step_fn(os.path.join(timestamp_dirpath,"actv_0-2.csv")) + data = parse_step_fn(os.path.join(self.timestamp_dirpath, "actv_0-2.csv")) result = { 'vp0:.input:micro0': { - 0: {'nans': 0.0,'norm': 5.550016}, - 1: {'nans': 0.0,'norm': 5.975112}, - 2: {'nans': 0.0,'norm': 5.789881} - }, + 0: {'nans': 0.0, 'norm': 5.550016}, + 1: {'nans': 0.0, 'norm': 5.975112}, + 2: {'nans': 0.0, 'norm': 5.789881} + }, 'vp0:.output:micro0': { - 0: {'nans': 0.0,'norm': 41.842655}, - 1: {'nans': 0.0,'norm': 44.40981}, - 2: {'nans': 0.0,'norm': 43.578354} - }, + 0: {'nans': 0.0, 'norm': 41.842655}, + 1: {'nans': 0.0, 'norm': 44.40981}, + 2: {'nans': 0.0, 'norm': 43.578354} + }, 'vp0:linear.input:micro0': { - 0: {'nans': 0.0,'norm': 5.550016}, - 1: {'nans': 0.0,'norm': 5.975112}, - 2: {'nans': 0.0,'norm': 5.789881} - }, + 0: {'nans': 0.0, 'norm': 5.550016}, + 1: {'nans': 0.0, 'norm': 5.975112}, + 2: {'nans': 0.0, 'norm': 5.789881} + }, 'vp0:linear.output:micro0': { - 0: {'nans': 0.0,'norm': 41.842655}, - 1: {'nans': 0.0,'norm': 44.40981}, - 2: {'nans': 0.0,'norm': 43.578354} - }, + 0: {'nans': 0.0, 'norm': 41.842655}, + 1: {'nans': 0.0, 'norm': 44.40981}, + 2: {'nans': 0.0, 'norm': 43.578354} + }, 'vp0:relu.input:micro0': { - 0: {'nans': 0.0,'norm': 41.842655}, - 1: {'nans': 0.0,'norm': 44.40981}, - 2: {'nans': 0.0,'norm': 43.578354} - }, + 0: {'nans': 0.0, 'norm': 41.842655}, + 1: {'nans': 0.0, 'norm': 44.40981}, + 2: {'nans': 0.0, 'norm': 43.578354} + }, 'vp0:relu.output:micro0': { - 0: {'nans': 0.0,'norm': 41.842655}, - 1: {'nans': 0.0,'norm': 44.40981}, - 2: {'nans': 0.0,'norm': 43.578354} - } + 0: {'nans': 0.0, 'norm': 41.842655}, + 1: {'nans': 0.0, 'norm': 44.40981}, + 2: {'nans': 0.0, 'norm': 43.578354} } + } 
self.assertEqual(dict_equal(data, result), True) - tb_data = extract_scalars_from_tensorboard(os.path.join(csv2tb_dirpath, "actv")) + tb_data = extract_scalars_from_tensorboard(os.path.join(self.csv2tb_dirpath, "actv")) print(tb_data) tb_result = { 'vp0:.input:micro0/nans': [(0, 0.0), - (1, 0.0), - (2, 0.0), - (3, 0.0), - (4, 0.0), - (5, 0.0), - (6, 0.0), - (7, 0.0), - (8, 0.0), - (9, 0.0)], + (1, 0.0), + (2, 0.0), + (3, 0.0), + (4, 0.0), + (5, 0.0), + (6, 0.0), + (7, 0.0), + (8, 0.0), + (9, 0.0)], 'vp0:.input:micro0/norm': [(0, 5.550015926361084), - (1, 5.975111961364746), - (2, 5.789881229400635), - (3, 6.052319049835205), - (4, 5.573315143585205), - (5, 5.864360809326172), - (6, 5.292460918426514), - (7, 5.477899074554443), - (8, 5.884613990783691), - (9, 5.456457138061523)], + (1, 5.975111961364746), + (2, 5.789881229400635), + (3, 6.052319049835205), + (4, 5.573315143585205), + (5, 5.864360809326172), + (6, 5.292460918426514), + (7, 5.477899074554443), + (8, 5.884613990783691), + (9, 5.456457138061523)], 'vp0:.output:micro0/nans': [(0, 0.0), - (1, 0.0), - (2, 0.0), - (3, 0.0), - (4, 0.0), - (5, 0.0), - (6, 0.0), - (7, 0.0), - (8, 0.0), - (9, 0.0)], - 'vp0:.output:micro0/norm': [(0, 41.842655181884766), - (1, 44.40980911254883), - (2, 43.57835388183594), - (3, 45.83631134033203), - (4, 42.0673828125), - (5, 43.46839141845703), - (6, 39.77947235107422), - (7, 40.200843811035156), - (8, 44.453147888183594), - (9, 40.841522216796875)], - 'vp0:linear.input:micro0/nans': [(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), @@ -244,117 +219,137 @@ class TestGradMonitor(unittest.TestCase): (7, 0.0), (8, 0.0), (9, 0.0)], + 'vp0:.output:micro0/norm': [(0, 41.842655181884766), + (1, 44.40980911254883), + (2, 43.57835388183594), + (3, 45.83631134033203), + (4, 42.0673828125), + (5, 43.46839141845703), + (6, 39.77947235107422), + (7, 40.200843811035156), + (8, 44.453147888183594), + (9, 40.841522216796875)], + 'vp0:linear.input:micro0/nans': [(0, 0.0), + (1, 0.0), + (2, 0.0), + (3, 0.0), + (4, 0.0), + (5, 0.0), + (6, 0.0), + (7, 0.0), + (8, 0.0), + (9, 0.0)], 'vp0:linear.input:micro0/norm': [(0, 5.550015926361084), - (1, 5.975111961364746), - (2, 5.789881229400635), - (3, 6.052319049835205), - (4, 5.573315143585205), - (5, 5.864360809326172), - (6, 5.292460918426514), - (7, 5.477899074554443), - (8, 5.884613990783691), - (9, 5.456457138061523)], + (1, 5.975111961364746), + (2, 5.789881229400635), + (3, 6.052319049835205), + (4, 5.573315143585205), + (5, 5.864360809326172), + (6, 5.292460918426514), + (7, 5.477899074554443), + (8, 5.884613990783691), + (9, 5.456457138061523)], 'vp0:linear.output:micro0/nans': [(0, 0.0), - (1, 0.0), - (2, 0.0), - (3, 0.0), - (4, 0.0), - (5, 0.0), - (6, 0.0), - (7, 0.0), - (8, 0.0), - (9, 0.0)], + (1, 0.0), + (2, 0.0), + (3, 0.0), + (4, 0.0), + (5, 0.0), + (6, 0.0), + (7, 0.0), + (8, 0.0), + (9, 0.0)], 'vp0:linear.output:micro0/norm': [(0, 41.842655181884766), - (1, 44.40980911254883), - (2, 43.57835388183594), - (3, 45.83631134033203), - (4, 42.0673828125), - (5, 43.46839141845703), - (6, 39.77947235107422), - (7, 40.200843811035156), - (8, 44.453147888183594), - (9, 40.841522216796875)], + (1, 44.40980911254883), + (2, 43.57835388183594), + (3, 45.83631134033203), + (4, 42.0673828125), + (5, 43.46839141845703), + (6, 39.77947235107422), + (7, 40.200843811035156), + (8, 44.453147888183594), + (9, 40.841522216796875)], 'vp0:relu.input:micro0/nans': [(0, 0.0), - (1, 0.0), - (2, 0.0), - (3, 0.0), - (4, 0.0), - (5, 0.0), - (6, 0.0), - (7, 0.0), - (8, 0.0), - (9, 0.0)], + (1, 0.0), 
+ (2, 0.0), + (3, 0.0), + (4, 0.0), + (5, 0.0), + (6, 0.0), + (7, 0.0), + (8, 0.0), + (9, 0.0)], 'vp0:relu.input:micro0/norm': [(0, 41.842655181884766), - (1, 44.40980911254883), - (2, 43.57835388183594), - (3, 45.83631134033203), - (4, 42.0673828125), - (5, 43.46839141845703), - (6, 39.77947235107422), - (7, 40.200843811035156), - (8, 44.453147888183594), - (9, 40.841522216796875)], + (1, 44.40980911254883), + (2, 43.57835388183594), + (3, 45.83631134033203), + (4, 42.0673828125), + (5, 43.46839141845703), + (6, 39.77947235107422), + (7, 40.200843811035156), + (8, 44.453147888183594), + (9, 40.841522216796875)], 'vp0:relu.output:micro0/nans': [(0, 0.0), - (1, 0.0), - (2, 0.0), - (3, 0.0), - (4, 0.0), - (5, 0.0), - (6, 0.0), - (7, 0.0), - (8, 0.0), - (9, 0.0)], + (1, 0.0), + (2, 0.0), + (3, 0.0), + (4, 0.0), + (5, 0.0), + (6, 0.0), + (7, 0.0), + (8, 0.0), + (9, 0.0)], 'vp0:relu.output:micro0/norm': [(0, 41.842655181884766), - (1, 44.40980911254883), - (2, 43.57835388183594), - (3, 45.83631134033203), - (4, 42.0673828125), - (5, 43.46839141845703), - (6, 39.77947235107422), - (7, 40.200843811035156), - (8, 44.453147888183594), - (9, 40.841522216796875)]} + (1, 44.40980911254883), + (2, 43.57835388183594), + (3, 45.83631134033203), + (4, 42.0673828125), + (5, 43.46839141845703), + (6, 39.77947235107422), + (7, 40.200843811035156), + (8, 44.453147888183594), + (9, 40.841522216796875)]} self.assertEqual(compare_scalar_dicts(tb_data, tb_result), True) - def test_actv_grad(self): - data = parse_step_fn(os.path.join(timestamp_dirpath,"actv_grad_0-2.csv")) + data = parse_step_fn(os.path.join(self.timestamp_dirpath, "actv_grad_0-2.csv")) nan = np.nan result = { 'vp0:.input:micro0': { - 0: {'norm': nan, 'nans': nan}, - 1: {'norm': nan, 'nans': nan}, + 0: {'norm': nan, 'nans': nan}, + 1: {'norm': nan, 'nans': nan}, 2: {'norm': nan, 'nans': nan} - }, + }, 'vp0:.output:micro0': { - 0: {'norm': 0.282843, 'nans': 0.0}, - 1: {'norm': 0.282617, 'nans': 0.0}, + 0: {'norm': 0.282843, 'nans': 0.0}, + 1: {'norm': 0.282617, 'nans': 0.0}, 2: {'norm': 0.282655, 'nans': 0.0} - }, + }, 'vp0:relu.input:micro0': { - 0: {'norm': 0.282843, 'nans': 0.0}, - 1: {'norm': 0.282617, 'nans': 0.0}, + 0: {'norm': 0.282843, 'nans': 0.0}, + 1: {'norm': 0.282617, 'nans': 0.0}, 2: {'norm': 0.282655, 'nans': 0.0} - }, + }, 'vp0:relu.output:micro0': { - 0: {'norm': 0.282843, 'nans': 0.0}, - 1: {'norm': 0.282617, 'nans': 0.0}, + 0: {'norm': 0.282843, 'nans': 0.0}, + 1: {'norm': 0.282617, 'nans': 0.0}, 2: {'norm': 0.282655, 'nans': 0.0} - }, + }, 'vp0:linear.input:micro0': { - 0: {'norm': nan, 'nans': nan}, - 1: {'norm': nan, 'nans': nan}, + 0: {'norm': nan, 'nans': nan}, + 1: {'norm': nan, 'nans': nan}, 2: {'norm': nan, 'nans': nan} - }, + }, 'vp0:linear.output:micro0': { - 0: {'norm': 0.282843, 'nans': 0.0}, - 1: {'norm': 0.282617, 'nans': 0.0}, + 0: {'norm': 0.282843, 'nans': 0.0}, + 1: {'norm': 0.282617, 'nans': 0.0}, 2: {'norm': 0.282655, 'nans': 0.0} - } } + } + print(data) self.assertEqual(dict_equal(data, result), True) - - tb_data = extract_scalars_from_tensorboard(os.path.join(csv2tb_dirpath, "actv_grad")) + + tb_data = extract_scalars_from_tensorboard(os.path.join(self.csv2tb_dirpath, "actv_grad")) tb_result = { 'vp0:.input:micro0/nans': [(0, nan), (1, nan), @@ -475,88 +470,91 @@ class TestGradMonitor(unittest.TestCase): (6, 0.28316599130630493), (7, 0.28274500370025635), (8, 0.2833530008792877), - (9, 0.2825529873371124)]} + (9, 0.2825529873371124)] + } + print(tb_data) self.assertEqual(compare_scalar_dicts(tb_data, 
tb_result), True) - def test_param(self): - data = parse_step_fn(os.path.join(timestamp_dirpath,"param_0-2.csv")) + data = parse_step_fn(os.path.join(self.timestamp_dirpath, "param_0-2.csv")) result = { 'vp0:linear.bias': { 0: {'nans': 0.0, 'norm': 2.236068}, 1: {'nans': 0.0, 'norm': 2.236198}, 2: {'nans': 0.0, 'norm': 2.235769} - }, + }, 'vp0:linear.weight': { 0: {'nans': 0.0, 'norm': 7.071068}, 1: {'nans': 0.0, 'norm': 7.068808}, 2: {'nans': 0.0, 'norm': 7.06771} - } } + } self.assertEqual(dict_equal(data, result), True) - tb_data = extract_scalars_from_tensorboard(os.path.join(csv2tb_dirpath, "param")) + tb_data = extract_scalars_from_tensorboard(os.path.join(self.csv2tb_dirpath, "param")) tb_result = { 'vp0:linear.weight/norm': [ - (0, 7.071067810058594), - (1, 7.068808078765869), - (2, 7.067709922790527), - (3, 7.0673418045043945), - (4, 7.066926956176758), - (5, 7.066311836242676), - (6, 7.065629959106445), - (7, 7.065262794494629), - (8, 7.065001964569092), - (9, 7.064840793609619)], + (0, 7.071067810058594), + (1, 7.068808078765869), + (2, 7.067709922790527), + (3, 7.0673418045043945), + (4, 7.066926956176758), + (5, 7.066311836242676), + (6, 7.065629959106445), + (7, 7.065262794494629), + (8, 7.065001964569092), + (9, 7.064840793609619)], 'vp0:linear.weight/nans': [ - (0, 0.0), - (1, 0.0), - (2, 0.0), - (3, 0.0), - (4, 0.0), - (5, 0.0), - (6, 0.0), - (7, 0.0), - (8, 0.0), - (9, 0.0)], + (0, 0.0), + (1, 0.0), + (2, 0.0), + (3, 0.0), + (4, 0.0), + (5, 0.0), + (6, 0.0), + (7, 0.0), + (8, 0.0), + (9, 0.0)], 'vp0:linear.bias/norm': [ - (0, 2.2360680103302), - (1, 2.2361979484558105), - (2, 2.235769033432007), - (3, 2.235903024673462), - (4, 2.2360129356384277), - (5, 2.2359039783477783), - (6, 2.2357990741729736), - (7, 2.2357349395751953), - (8, 2.2356700897216797), - (9, 2.235619068145752)], + (0, 2.2360680103302), + (1, 2.2361979484558105), + (2, 2.235769033432007), + (3, 2.235903024673462), + (4, 2.2360129356384277), + (5, 2.2359039783477783), + (6, 2.2357990741729736), + (7, 2.2357349395751953), + (8, 2.2356700897216797), + (9, 2.235619068145752) + ], 'vp0:linear.bias/nans': [ - (0, 0.0), - (1, 0.0), - (2, 0.0), - (3, 0.0), - (4, 0.0), - (5, 0.0), - (6, 0.0), - (7, 0.0), - (8, 0.0), - (9, 0.0)] - } + (0, 0.0), + (1, 0.0), + (2, 0.0), + (3, 0.0), + (4, 0.0), + (5, 0.0), + (6, 0.0), + (7, 0.0), + (8, 0.0), + (9, 0.0) + ] + } self.assertEqual(compare_scalar_dicts(tb_data, tb_result), True) def test_exp_avg(self): - data = parse_step_fn(os.path.join(timestamp_dirpath,"exp_avg_0-2.csv")) + data = parse_step_fn(os.path.join(self.timestamp_dirpath, "exp_avg_0-2.csv")) result = { 'vp0:linear.bias': { 1: {'nans': 0.0, 'norm': 0.024495}, 2: {'nans': 0.0, 'norm': 0.052203} - }, + }, 'vp0:linear.weight': { 1: {'nans': 0.0, 'norm': 0.052394}, 2: {'nans': 0.0, 'norm': 0.099221} - } } + } self.assertEqual(dict_equal(data, result), True) - tb_data = extract_scalars_from_tensorboard(os.path.join(csv2tb_dirpath, "exp_avg")) + tb_data = extract_scalars_from_tensorboard(os.path.join(self.csv2tb_dirpath, "exp_avg")) tb_result = { 'vp0:linear.bias/nans': [(1, 0.0), (2, 0.0), @@ -597,19 +595,19 @@ class TestGradMonitor(unittest.TestCase): self.assertEqual(compare_scalar_dicts(tb_data, tb_result), True) def test_exp_avg_sq(self): - data = parse_step_fn(os.path.join(timestamp_dirpath,"exp_avg_sq_0-2.csv")) + data = parse_step_fn(os.path.join(self.timestamp_dirpath, "exp_avg_sq_0-2.csv")) result = { 'vp0:linear.bias': { 1: {'nans': 0.0, 'norm': 4.2e-05}, 2: {'nans': 0.0, 'norm': 9.6e-05} - }, + }, 
'vp0:linear.weight': { 1: {'nans': 0.0, 'norm': 6.7e-05}, 2: {'nans': 0.0, 'norm': 0.000126} - } } + } self.assertEqual(dict_equal(data, result), True) - tb_data = extract_scalars_from_tensorboard(os.path.join(csv2tb_dirpath, "exp_avg_sq")) + tb_data = extract_scalars_from_tensorboard(os.path.join(self.csv2tb_dirpath, "exp_avg_sq")) tb_result = { 'vp0:linear.bias/nans': [(1, 0.0), (2, 0.0), @@ -648,23 +646,23 @@ class TestGradMonitor(unittest.TestCase): (8, 0.00028700000257231295), (9, 0.0003060000017285347)]} self.assertEqual(compare_scalar_dicts(tb_data, tb_result), True) - + def test_grad_reduced(self): - data = parse_step_fn(os.path.join(timestamp_dirpath,"grad_reduced_0-2.csv")) + data = parse_step_fn(os.path.join(self.timestamp_dirpath, "grad_reduced_0-2.csv")) result = { 'vp0:linear.bias': { 0: {'nans': 0.0, 'norm': 0.244949}, 1: {'nans': 0.0, 'norm': 0.314345}, 2: {'nans': 0.0, 'norm': 0.281475} - }, + }, 'vp0:linear.weight': { 0: {'nans': 0.0, 'norm': 0.523935}, 1: {'nans': 0.0, 'norm': 0.595672}, 2: {'nans': 0.0, 'norm': 0.497603} - } } + } self.assertEqual(dict_equal(data, result), True) - tb_data = extract_scalars_from_tensorboard(os.path.join(csv2tb_dirpath, "grad_reduced")) + tb_data = extract_scalars_from_tensorboard(os.path.join(self.csv2tb_dirpath, "grad_reduced")) tb_result = { 'vp0:linear.bias/nans': [(0, 0.0), (1, 0.0), @@ -707,24 +705,24 @@ class TestGradMonitor(unittest.TestCase): (8, 0.3234719932079315), (9, 0.32385098934173584)]} self.assertEqual(compare_scalar_dicts(tb_data, tb_result), True) - + def test_grad_unreduced(self): - data = parse_step_fn(os.path.join(timestamp_dirpath,"grad_unreduced_0-2.csv")) + data = parse_step_fn(os.path.join(self.timestamp_dirpath, "grad_unreduced_0-2.csv")) result = { 'vp0:linear.bias': { 0: {'nans': 0.0, 'norm': 0.244949}, 1: {'nans': 0.0, 'norm': 0.314345}, 2: {'nans': 0.0, 'norm': 0.281475} - }, + }, 'vp0:linear.weight': { 0: {'nans': 0.0, 'norm': 0.523935}, 1: {'nans': 0.0, 'norm': 0.595672}, 2: {'nans': 0.0, 'norm': 0.497603} - } } + } self.assertEqual(dict_equal(data, result), True) - tb_data = extract_scalars_from_tensorboard(os.path.join(csv2tb_dirpath, "grad_unreduced")) + tb_data = extract_scalars_from_tensorboard(os.path.join(self.csv2tb_dirpath, "grad_unreduced")) tb_result = { 'vp0:linear.bias/nans': [(0, 0.0), (1, 0.0), @@ -767,3 +765,7 @@ class TestGradMonitor(unittest.TestCase): (8, 0.3234719932079315), (9, 0.32385098934173584)]} self.assertEqual(compare_scalar_dicts(tb_data, tb_result), True) + + +if __name__ == '__main__': + unittest.main()
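Editor's note: the central new helper in this patch is BaseWriterWithAD.stack_tensors, added to both the MindSpore and PyTorch monitors, which works around the fact that a plain stack cannot mix CPU and accelerator tensors. Below is a minimal standalone sketch of that device-grouping idea in plain PyTorch, not the patched class itself; the function name and the __main__ demo are illustrative only.

# A minimal sketch (assumption: scalar tensors, plain torch) of the approach used by
# the stack_tensors helper added in this patch: group by device, stack each group,
# move the accelerator group to CPU once, then re-interleave in the input order.
from typing import List

import torch


def stack_mixed_device_tensors(tensor_list: List[torch.Tensor]) -> torch.Tensor:
    cpu_tensors = [t for t in tensor_list if t.device.type == "cpu"]
    xpu_tensors = [t for t in tensor_list if t.device.type != "cpu"]

    # Stack each group on its own device; a single .cpu() on the accelerator stack
    # avoids synchronizing tensor-by-tensor.
    cpu_stack = torch.stack(cpu_tensors) if cpu_tensors else torch.tensor([])
    xpu_stack = torch.stack(xpu_tensors).cpu() if xpu_tensors else torch.tensor([])

    # Restore the original interleaving so callers can still zip the result with tags.
    result, cpu_idx, xpu_idx = [], 0, 0
    for t in tensor_list:
        if t.device.type == "cpu":
            result.append(cpu_stack[cpu_idx])
            cpu_idx += 1
        else:
            result.append(xpu_stack[xpu_idx])
            xpu_idx += 1
    return torch.stack(result)


if __name__ == "__main__":
    # CPU-only demo; on an NPU/GPU host the same call accepts a mixed-device list.
    print(stack_mixed_device_tensors([torch.tensor(-1.6165), torch.tensor(-1.0985)]))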