class GradComparator:
    """Compare boolean gradient-direction dumps produced by two training runs.

    Both input paths are expected to contain ``step_<n>`` sub-directories
    holding ``*.pt`` files of equal-shaped bool tensors.  For every step/file
    present in BOTH runs, the element-wise agreement ratio is computed and
    saved under ``output_dir`` as one png curve + one csv per file, plus an
    aggregated ``summary`` series.
    """

    @staticmethod
    def compare(path1: str, path2: str, output_dir):
        """Compare every common step of the two runs and save the results.

        Raises:
            Exception: when the two runs share no ``step_<n>`` directory.
        """
        steps = GradComparator._get_steps(path1, path2)
        if not steps:
            raise Exception("no step for comparison")
        similarities = {}
        print(f"the following steps will be compared:\n{steps}")
        for step in tqdm(steps, desc="calculate similarities"):
            pt_files = GradComparator._get_pt_files(path1, path2, step)
            same_count_summary = 0
            total_count_summary = 0
            for pt_file in pt_files:
                pt1 = f'{path1}/step_{step}/{pt_file}'
                pt2 = f'{path2}/step_{step}/{pt_file}'
                same_count, total_count = GradComparator._calc_similarity(pt1, pt2)
                same_count_summary += same_count
                total_count_summary += total_count
                # guard against empty tensors: ratio defaults to 0
                ratio = same_count / total_count if total_count else 0
                similarities.setdefault(pt_file, []).append(ratio)
            summary = (same_count_summary / total_count_summary
                       if total_count_summary else 0)
            similarities.setdefault("summary", []).append(summary)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        GradComparator._save_similar(similarities, steps, output_dir)

    @staticmethod
    def _get_steps(path1: str, path2: str):
        """Return the sorted step numbers whose ``step_<n>`` dir exists in both paths."""
        path_check(path1, isdir=True)
        path_check(path2, isdir=True)
        steps = []
        for folder1 in os.listdir(path1):
            splits = folder1.split('_')
            # require the exact 'step_<digits>' form; the len check guards
            # splits[1] (a folder named just 'step' used to raise IndexError)
            if len(splits) < 2 or splits[0] != 'step' or not splits[1].isdigit():
                continue
            if not os.path.exists(f'{path2}/{folder1}'):
                continue
            steps.append(int(splits[1]))
        return sorted(steps)

    @staticmethod
    def _get_pt_files(path1: str, path2: str, step: int):
        """Return the sorted ``*.pt`` file names present in both runs for *step*."""
        step_dir1 = f'{path1}/step_{step}'
        step_dir2 = f'{path2}/step_{step}'
        path_check(step_dir1, isdir=True)
        path_check(step_dir2, isdir=True)
        pt_files = []
        for name in os.listdir(step_dir1):
            if not name.endswith('.pt'):
                continue
            if not os.path.exists(f'{step_dir2}/{name}'):
                continue
            pt_files.append(name)
        return sorted(pt_files)

    @staticmethod
    def _save_similar(similarities: dict, steps: list, output_dir: str):
        """Plot each similarity series as a png and dump it as a one-row csv.

        Raises:
            Exception: when *similarities* is empty, or a series length does
                not match the number of steps.
        """
        if len(similarities) == 0:
            raise Exception("length of similarities is 0")
        for key, value in tqdm(similarities.items(), desc="save similarities"):
            if len(value) != len(steps):
                raise Exception(f"similarities length of {key}:{len(value)} not equal steps:{len(steps)}")
            plt.plot(steps, value)
            plt.xlabel('steps')
            plt.ylabel('similarities')
            plt.title(f'{key}_similarities')
            plt.savefig(f'{output_dir}/{key}_similarities.png')
            plt.close()
            head_tuple = tuple(['step'] + [str(step) for step in steps])
            write_csv(f"{output_dir}/{key}_similarities.csv", [['similarity'] + value], head_tuple)

    @staticmethod
    def _calc_similarity(pt_file1: str, pt_file2: str):
        """Return (matching-element count, total element count) for two bool tensors.

        map_location='cpu' lets GPU-saved dumps be compared on CPU-only hosts.
        NOTE(review): torch.load unpickles arbitrary data — only feed it
        trusted dump files.
        """
        tensor1 = torch.load(pt_file1, map_location='cpu')
        tensor2 = torch.load(pt_file2, map_location='cpu')
        if tensor1.shape != tensor2.shape:
            raise Exception(f"tensor shape is not equal: {pt_file1}, {pt_file2}")
        if tensor1.dtype != torch.bool:
            raise Exception(f"tensor type is not bool: {pt_file1}")
        if tensor2.dtype != torch.bool:
            raise Exception(f"tensor type is not bool: {pt_file2}")
        same_count = (tensor1 == tensor2).sum().item()
        total_count = tensor1.numel()
        return same_count, total_count
1abbc63bc4cda47e488fe7409a82318fc14b54e7..2388784a29ab3f200aa437701315c9e925189367 100644 --- a/debug/accuracy_tools/grad_tool/grad_monitor.py +++ b/debug/accuracy_tools/grad_tool/grad_monitor.py @@ -4,7 +4,7 @@ import torch from grad_tool.level_adapter import Level, LevelAdapter from grad_tool.grad_stat_csv import GradStatCsv from grad_tool.utils import get_config, check_numeral_list_ascend, ListCache, data_in_list_target,\ - write_csv, make_localtime_dir, get_rank_id + write_csv, make_localtime_dir, get_rank_id, print_debug_log class GradientMonitor: @@ -15,7 +15,9 @@ class GradientMonitor: self._level_adp: Level = LevelAdapter.level_adapter(config.get("level")) self._param_list = config.get('param_list') self._target_ranks = config.get("rank") + print_debug_log(f"target rank {self._target_ranks}") self._target_step = config.get("step") + print_debug_log(f"target step {self._target_step}") self._bounds = config.get("bounds") if not self._bounds or len(self._bounds) == 0: self._bounds = GradientMonitor.default_bounds @@ -33,12 +35,14 @@ class GradientMonitor: def model_backward_hook(self, module, gin, gout): if not hasattr(self, "_rank"): setattr(self, "_rank", get_rank_id(gout)) + print_debug_log(f"rank{self._rank} exists") if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks): return self._list_cache.flush() self._step += 1 if not data_in_list_target(self._step, self._target_step): return + print_debug_log(f"result generate: rank{self._rank} step{self._step}") output_path = f'{self._output_path}/rank_{self._rank}/grad_summary_{self._step}.csv' write_csv(output_path, [], GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds)) self._list_cache.set_output_file(output_path) @@ -50,6 +54,7 @@ class GradientMonitor: return if not data_in_list_target(self._step, self._target_step): return + print_debug_log(f"param result: rank{self._rank} step{self._step} {param_name}") grad_info = 
GradStatCsv.generate_csv_line( level=self._level_adp, param_name=param_name, @@ -60,8 +65,39 @@ class GradientMonitor: def monitor(self, model): - model.register_full_backward_hook(self.model_backward_hook) + last_module = None + for name, module in model.named_modules(): + last_module = module + last_module.register_backward_hook(self.model_backward_hook) for param_name, param in model.named_parameters(): if not data_in_list_target(param_name, self._param_list): continue - param.register_hook(GradientMonitor.hook_fun(param_name, self.save_grad_stat)) \ No newline at end of file + if param is None or param.requires_grad == False: + continue + param.register_hook(GradientMonitor.hook_fun(param_name, self.save_grad_stat)) + + + def save_grad(self, model): + self._step += 1 + if not data_in_list_target(self._step, self._target_step): + return + print_debug_log(f"save grad step{self._step}") + if hasattr(self, "_rank"): + # raise AttributeError("grad monitor need attribute {_rank} when save grad stat") + output_path = f'{self._output_path}/rank_{self._rank}/grad_summary{self._step}.csv' + write_csv(output_path, [], GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds)) + self._list_cache.set_output_file(output_path) + for param_name, param in model.named_parameters(): + if not hasattr(self, "_rank"): + # raise AttributeError("grad monitor need attribute {_rank} when save grad stat") + setattr(self, "_rank", get_rank_id(param)) + output_path = f'{self._output_path}/rank_{self._rank}/grad_summary{self._step}.csv' + write_csv(output_path, [], GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds)) + self._list_cache.set_output_file(output_path) + if not data_in_list_target(param_name, self._param_list): + continue + if param.grad is None: + continue + self.save_grad_stat(param_name, param.grad) + print_debug_log(f"{param_name} is saved") + self._list_cache.flush() \ No newline at end of file diff --git 
def print_debug_log(s):
    """Print *s* to stdout with the gradient-tool debug prefix."""
    print(f"gradient debug: {s}")


def path_check(path, isdir=False):
    """Validate *path* via ptdbg's check_file_or_directory_path.

    isdir: presumably requires *path* to be a directory when True — confirm
    against check_file_or_directory_path's signature.
    """
    check_file_or_directory_path(path, isdir)