diff --git a/debug/accuracy_tools/grad_tool/README.md b/debug/accuracy_tools/grad_tool/README.md
index 78acffbe685f4b724dd1cff82a48236276e8b16f..b5c11bb2045661e5df2d47c6897205eea452d775 100644
--- a/debug/accuracy_tools/grad_tool/README.md
+++ b/debug/accuracy_tools/grad_tool/README.md
@@ -77,27 +77,18 @@
-2. Before the training flow runs, instantiate a GradientDumper object with the path to config.yaml. Sample code:
+2. Once the model has been constructed, insert the following code:
 
    ```python
    from grad_tool.grad_monitor import GradientMonitor
    gm = GradientMonitor("config_path")
+   gm.monitor(model)
    ```
 
+   config_path: the path to config.yaml, used to instantiate a GradientMonitor object
+
+   model: the freshly constructed model to be monitored
-3. Insert code to monitor the model. There are two methods; choose either one:
-
-   Recommended: in the training flow, call gm.save_grad with the model as the argument, after the backward pass and before gradients are zeroed.
-
-   ```python
-   gm.save_grad(model)
-   ```
-
-   Alternative: before training starts, call gm.monitor with the model as the argument. This method is currently unstable.
-
-   ```python
-   gm.monitor(model)
-   ```
 
 ### Output
 
 **Output directory structure** (level is L2)
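A minimal end-to-end sketch of the updated README flow, assuming `grad_tool` is importable; the toy model, the SGD optimizer, and the `./config.yaml` path are illustrative placeholders, not part of this repository:

```python
import torch
import torch.nn as nn
from grad_tool.grad_monitor import GradientMonitor

model = nn.Linear(4, 2)                # placeholder for the real network
gm = GradientMonitor("./config.yaml")  # path to the config.yaml described above
gm.monitor(model)                      # attach right after model construction

optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for _ in range(3):                     # the training loop itself needs no changes
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    optimizer.step()                   # stats are dumped just before each step
    optimizer.zero_grad()
```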
diff --git a/debug/accuracy_tools/grad_tool/grad_comparator.py b/debug/accuracy_tools/grad_tool/grad_comparator.py
index ff6302d4fc29d76796cdb533dc6ba3622bcc25a5..f2daf988cc4b45c48373f9f00a03d826da802513 100644
--- a/debug/accuracy_tools/grad_tool/grad_comparator.py
+++ b/debug/accuracy_tools/grad_tool/grad_comparator.py
@@ -114,8 +114,8 @@ class GradComparator:
 
     @staticmethod
     def _calculate_similarity(pt_file1: str, pt_file2: str):
-        tensor1 = torch.load(pt_file1)
-        tensor2 = torch.load(pt_file2)
+        tensor1 = torch.load(pt_file1, map_location=torch.device("cpu"))
+        tensor2 = torch.load(pt_file2, map_location=torch.device("cpu"))
         if tensor1.shape != tensor2.shape:
             raise Exception(f"tensor shape is not equal: {pt_file1}, {pt_file2}")
         if tensor1.dtype != torch.bool:
diff --git a/debug/accuracy_tools/grad_tool/grad_monitor.py b/debug/accuracy_tools/grad_tool/grad_monitor.py
index bf1a6da0ba31af41d6a7e43b6b3b4bacba75b4b8..9324d60932e80d2aa2c2063f15c41ee26b6b2529 100644
--- a/debug/accuracy_tools/grad_tool/grad_monitor.py
+++ b/debug/accuracy_tools/grad_tool/grad_monitor.py
@@ -1,9 +1,11 @@
 import os
+from collections import defaultdict
 import torch
+from torch.optim.optimizer import register_optimizer_step_pre_hook
 from grad_tool.level_adapter import Level, LevelAdapter
 from grad_tool.grad_stat_csv import GradStatCsv
-from grad_tool.utils import get_config, check_numeral_list_ascend, ListCache, data_in_list_target,\
-    write_csv, get_rank_id, print_info_log, create_directory, print_warn_log
+from grad_tool.utils import get_config, check_numeral_list_ascend, data_in_list_target,\
+    write_csv, get_rank_id, print_info_log, create_directory, print_warn_log, print_rank_0
 
 
 class GradientMonitor:
@@ -27,81 +29,40 @@ class GradientMonitor:
         else:
             print_warn_log(f"the file in {self._output_path} will be recoverd")
         self._step = -1
-        self._list_cache = ListCache()
-
-    @staticmethod
-    def _hook_fun(param_name, f):
-        def backward_hook(grad):
-            f(param_name, grad)
-        return backward_hook
+        self._param2name = defaultdict(str)
 
     def _rank_in_targets(self):
         if not hasattr(self, "_rank"):
             raise AttributeError("grad monitor need attribute {_rank}")
         return not torch.distributed.is_initialized() or data_in_list_target(getattr(self, "_rank"), self._target_ranks)
-
-    def _model_backward_hook(self, module, gin, gout):
-        self._step += 1
-        if not hasattr(self, "_rank"):
-            setattr(self, "_rank", get_rank_id(gout))
-            print_info_log(f"rank_{self._rank} exists")
-        if not self._rank_in_targets():
-            return
-        self._list_cache.flush()
-        if not data_in_list_target(self._step, self._target_step):
-            return
-        print_info_log(f"result generate: rank_{self._rank} step_{self._step}")
-        output_path = os.path.join(self._output_path, f"rank_{getattr(self, '_rank')}", f"grad_summary_{self._step}.csv")
-        write_csv(output_path, [], GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds))
-        self._list_cache.set_output_file(output_path)
-
-    def _save_grad_stat(self, param_name, grad):
-        if not self._rank_in_targets():
-            return
-        if not data_in_list_target(self._step, self._target_step):
-            return
-        print_info_log(f"param result: rank{self._rank} step{self._step} {param_name}")
-        grad_info = GradStatCsv.generate_csv_line(
-            level=self._level_adp,
-            param_name=param_name,
-            grad=grad,
-            bounds=self._bounds)
-        self._list_cache.append(grad_info)
-        self._level_adp.save_grad_direction(param_name, grad, f'{self._output_path}/rank_{self._rank}/step_{self._step}')
 
-    def monitor(self, model):
-        last_module = None
-        for name, module in model.named_modules():
-            last_module = module
-        last_module.register_backward_hook(self._model_backward_hook)
-        for param_name, param in model.named_parameters():
-            if not data_in_list_target(param_name, self._param_list):
-                continue
-            if param is None or param.requires_grad == False:
-                continue
-            param.register_hook(GradientMonitor._hook_fun(param_name, self._save_grad_stat))
+    def _hook_optimizer(self):
+        def optimizer_pre_step_hook(optimizer, args, kargs):
+            self._step += 1
+            if not data_in_list_target(self._step, self._target_step):
+                return
+            output_lines = []
+            for param, param_name in self._param2name.items():
+                if not data_in_list_target(param_name, self._param_list):
+                    continue
+                grad = param.main_grad if hasattr(param, "main_grad") else param.grad
+                grad_info = GradStatCsv.generate_csv_line(
+                    level=self._level_adp,
+                    param_name=param_name,
+                    grad=grad,
+                    bounds=self._bounds)
+                output_lines.append(grad_info)
+                self._level_adp.save_grad_direction(param_name, grad, f'{self._output_path}/rank_{self._rank}/step_{self._step}')
+            output_path = os.path.join(self._output_path, f"rank_{getattr(self, '_rank')}", f"grad_summary_{self._step}.csv")
+            write_csv(output_path, output_lines, GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds))
+        register_optimizer_step_pre_hook(optimizer_pre_step_hook)
 
-    def save_grad(self, model):
-        self._step += 1
-        if not hasattr(self, "_rank"):
-            setattr(self, "_rank", get_rank_id(next(model.parameters())))
-        if not self._rank_in_targets():
-            return
-        if not data_in_list_target(self._step, self._target_step):
+    def monitor(self, model):
+        print_rank_0("> parameter names:")
+        for name, param in model.named_parameters():
+            self._param2name[param] = name
+            print_rank_0(f"\t{name}")
+        setattr(self, "_rank", get_rank_id())
+        if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks):
             return
-        print_info_log(f"save grad rank_{getattr(self, '_rank')} step_{self._step}")
-        output_path = os.path.join(self._output_path, f"rank_{getattr(self, '_rank')}", f"grad_summary_{self._step}.csv")
-        write_csv(output_path, [], GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds))
-        self._list_cache.set_output_file(output_path)
-        for param_name, param in model.named_parameters():
-            if not data_in_list_target(param_name, self._param_list):
-                continue
-            if param.grad is not None:
-                grad = param.grad
-            elif hasattr(param, "main_grad") and param.main_grad is not None:
-                grad = param.main_grad
-            else:
-                continue
-            self._save_grad_stat(param_name, grad)
-            print_info_log(f"{param_name} is saved")
-        self._list_cache.flush()
+        self._hook_optimizer()
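The rewrite hinges on PyTorch's global optimizer pre-step hook (`torch.optim.optimizer.register_optimizer_step_pre_hook`, available since PyTorch 2.0), which fires before every `optimizer.step()` in the process, once gradients are final. A standalone sketch of that mechanism, separate from grad_tool:

```python
import torch
from torch.optim.optimizer import register_optimizer_step_pre_hook

def pre_step_hook(optimizer, args, kwargs):
    # Runs before each optimizer.step(); .grad is fully accumulated at this point.
    for group in optimizer.param_groups:
        for p in group["params"]:
            if p.grad is not None:
                print(tuple(p.shape), p.grad.norm().item())

handle = register_optimizer_step_pre_hook(pre_step_hook)  # returns a RemovableHandle

model = torch.nn.Linear(4, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
model(torch.randn(1, 4)).sum().backward()
opt.step()       # the hook fires first, then the parameter update
handle.remove()  # detach the global hook when monitoring is done
```

Compared with the old per-parameter backward hooks, this samples each gradient at a single well-defined point just before the update, which is also where framework-maintained `main_grad` buffers (preferred by the new hook when present) have finished accumulating.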
diff --git a/debug/accuracy_tools/grad_tool/level_adapter.py b/debug/accuracy_tools/grad_tool/level_adapter.py
index 64bb1e92a9272f2b071a9cc61094713ac7d84db2..e03bf2b110dc8041b356d83843e365e91236d32a 100644
--- a/debug/accuracy_tools/grad_tool/level_adapter.py
+++ b/debug/accuracy_tools/grad_tool/level_adapter.py
@@ -37,7 +37,6 @@ class LevelOps:
         param_grad = grad.clone().detach()
         is_positive = param_grad > 0
         torch.save(is_positive, f'{save_path}/{param_name}.pt')
-        print_info_log(f'Save {param_name} bool tensor, it has {is_positive.sum()}/{is_positive.numel()} positive elements')
 
 
 class Level(ABC):
diff --git a/debug/accuracy_tools/grad_tool/utils.py b/debug/accuracy_tools/grad_tool/utils.py
index 8d0ed36ce30bdb502b26a6c5d7d0c1276ee8cc74..a7db58be29a9fbf55c04a8283cb68d09f9289796 100644
--- a/debug/accuracy_tools/grad_tool/utils.py
+++ b/debug/accuracy_tools/grad_tool/utils.py
@@ -1,6 +1,7 @@
 import os
 import yaml
 import torch
+import torch.distributed as dist
 import pandas as pd
 from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileOpen, create_directory, \
     FileChecker, FileCheckConst
@@ -8,34 +9,6 @@ from ptdbg_ascend.src.python.ptdbg_ascend.common.utils import check_file_or_dire
     print_warn_log
 
 
-class ListCache(list):
-    threshold = 1000
-
-    def __init__(self, *args):
-        self._dump_count = 0
-        super().__init__(*args)
-
-    def __del__(self):
-        self.flush()
-
-    def flush(self):
-        if len(self) == 0:
-            return
-        if not self._output_file:
-            print_warn_log("dumpfile path is not setted")
-        write_csv(self._output_file, self, [])
-        print_info_log(f"write {len(self)} items to {self._output_file} the {self._dump_count} time")
-        self.clear()
-
-    def append(self, data):
-        list.append(self, data)
-        if len(self) >= ListCache.threshold:
-            self.flush()
-
-    def set_output_file(self, output_file):
-        self._output_file = output_file
-
-
 def get_config(filepath):
     with FileOpen(filepath, 'r') as file:
         config = yaml.safe_load(file)
@@ -52,6 +25,7 @@ def write_csv(filepath, content_list, header):
     filepath_checker.common_check()
     new_data = pd.DataFrame(list(content for content in content_list))
     new_data.to_csv(filepath, mode='a+', header=False, index=False)
+    print_info_log(f"write {len(content_list)} items to {filepath}")
 
 
 def make_file_safety(file_path: str, permission=0o640):
@@ -83,28 +57,19 @@ def check_numeral_list_ascend(lst):
         raise Exception("The input list should be ascending")
 
 
-def get_tensor_rank(x):
-    if isinstance(x, (list, tuple)):
-        if len(x) > 0:
-            return get_tensor_rank(x[0])
-        return None
-    elif isinstance(x, torch.Tensor):
-        device = x.device
-        if device.type == 'cpu':
-            return None
-        else:
-            return device.index
-    return None
-
-
-def get_rank_id(tensor):
+def get_rank_id():
     if torch.distributed.is_initialized():
         return torch.distributed.get_rank()
-    rank = get_tensor_rank(tensor)
-    if rank is not None:
-        return rank
     return os.getpid()
 
 
 def path_check(path, isdir=False):
     check_file_or_directory_path(path, isdir)
+
+
+def print_rank_0(message):
+    if dist.is_initialized():
+        if dist.get_rank() == 0:
+            print(message)
+    else:
+        print(message)
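For reference, a small sketch of why the comparator now pins `torch.load` to CPU: the bool direction tensors that `level_adapter` dumps may be written on an NPU/GPU device, and without `map_location` they would try to deserialize back onto that device, which a CPU-only comparison host may not have. The file name below is illustrative:

```python
import torch

grad = torch.randn(3)
torch.save(grad > 0, "param.pt")  # bool "gradient direction" dump, as level_adapter writes

# Without map_location, a dump written on an accelerator can fail to load on a CPU-only host.
loaded = torch.load("param.pt", map_location=torch.device("cpu"))
assert loaded.device.type == "cpu"
```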