diff --git a/debug/accuracy_tools/grad_tool/common/utils.py b/debug/accuracy_tools/grad_tool/common/utils.py index cdce3fda7e38940c293849b63f139ce96c5d1d55..2927f2b5d1fb600d2e5af141a843c35fefc0d8ea 100644 --- a/debug/accuracy_tools/grad_tool/common/utils.py +++ b/debug/accuracy_tools/grad_tool/common/utils.py @@ -2,6 +2,7 @@ import os import re import sys import time +import shutil import yaml import pandas as pd @@ -220,3 +221,15 @@ def change_mode(path, mode): except PermissionError as ex: print_error_log(f'Failed to change {path} authority. {str(ex)}') raise ex + +def remove_path(path): + if not os.path.exists(path): + return + try: + if os.path.islink(path) or os.path.isfile(path): + os.remove(path) + else: + shutil.rmtree(path) + except PermissionError as err: + print_error_log("Failed to delete {}. Please check the permission.".format(path)) + raise err \ No newline at end of file diff --git a/debug/accuracy_tools/grad_tool/grad_pt/grad_monitor.py b/debug/accuracy_tools/grad_tool/grad_pt/grad_monitor.py index f3079e622c29eefe20a6e1fdc9d372002b596610..eb7b3a896dfe5269c3b14673389285b4474cfad9 100644 --- a/debug/accuracy_tools/grad_tool/grad_pt/grad_monitor.py +++ b/debug/accuracy_tools/grad_tool/grad_pt/grad_monitor.py @@ -6,7 +6,7 @@ from torch.optim.optimizer import register_optimizer_step_pre_hook from grad_tool.common.base_monitor import BaseMonitor from grad_tool.grad_pt.grad_stat_csv import GradStatCsv from grad_tool.common.utils import check_numeral_list_ascend, data_in_list_target, \ - write_csv, print_info_log, create_directory, print_warn_log, change_mode + write_csv, print_info_log, create_directory, print_warn_log, change_mode, remove_path from grad_tool.grad_pt.utils import get_rank_id, print_rank_0, GradConst @@ -49,6 +49,8 @@ class PtGradientMonitor(BaseMonitor): print_warn_log(f"the file in {self._output_path} will be recoverd") self._step = -1 self._param2name = defaultdict(str) + self._exec_order = [] + self._recording_exec_order = True @property def output_path(self): @@ -65,10 +67,18 @@ class PtGradientMonitor(BaseMonitor): change_mode(save_filepath, 0o640) def monitor(self, model): + + def tensor_hook(name): + def record_order(grad): + if self._recording_exec_order: + self._exec_order.append(name) + return record_order + print_rank_0("> parameter names:") for name, param in model.named_parameters(): self._param2name[param] = name print_rank_0(f"\t{name}") + param.register_hook(tensor_hook(name)) setattr(self, "_rank", get_rank_id()) if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks): return @@ -79,8 +89,13 @@ class PtGradientMonitor(BaseMonitor): raise AttributeError("grad monitor need attribute {_rank}") return not torch.distributed.is_initialized() or data_in_list_target(getattr(self, "_rank"), self._target_ranks) + def _output_sort_by_exec_order(self, output_lines): + dict_by_name = {x[0]: x for x in output_lines} + return [dict_by_name[i] for i in self._exec_order] + def _hook_optimizer(self): def optimizer_pre_step_hook(optimizer, args, kargs): + self._recording_exec_order = False self._step += 1 if not data_in_list_target(self._step, self._target_step): return @@ -97,8 +112,16 @@ class PtGradientMonitor(BaseMonitor): if self._level_adp["have_grad_direction"]: PtGradientMonitor.save_grad_direction(param_name, grad, f'{self._output_path}/rank{self._rank}/step{self._step}') + + if self._exec_order: + if len(self._exec_order) != len(output_lines): + print_warn_log(f"grad_number in tensor_hook not equal to grad_number in optimizer_hook: \ + {len(self._exec_order)} vs {len(output_lines)}. maybe something wrong happend.") + else: + output_lines = self._output_sort_by_exec_order(output_lines) output_path = os.path.join(self._output_path, f"rank{getattr(self, '_rank')}", f"grad_summary_{self._step}.csv") + remove_path(output_path) write_csv(output_path, output_lines, GradStatCsv.generate_csv_header(self._level_adp, self._bounds)) diff --git a/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py b/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py index 77a51e0941fa80edd53c2814c9c1a9252001b9ae..2121db7cc586074984f1dd6af17a7379c440e3ec 100644 --- a/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py +++ b/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py @@ -64,4 +64,4 @@ class TestGradMonitor(unittest.TestCase): self.assertEqual(len(items), 1) with open(os.path.join(compare_output_path, items[0], "similarities.csv"), 'r') as f: data = f.read() - self.assertEqual(hashlib.md5(data.encode("utf-8")).hexdigest(), "0e2bd636b48245d387647523c8517982") + self.assertEqual(hashlib.md5(data.encode("utf-8")).hexdigest(), "a968d0cc1c6e289c00ff4fd5811811ca")