From 67a2fd892fe7e5f937e01257e74c20247c19976e Mon Sep 17 00:00:00 2001
From: pengxiaopeng
Date: Thu, 18 Jan 2024 15:23:35 +0800
Subject: [PATCH] grad tool

---
 debug/accuracy_tools/grad_tool/__init__.py    |   0
 .../accuracy_tools/grad_tool/grad_monitor.py  |  67 ++++++++++
 .../accuracy_tools/grad_tool/grad_stat_csv.py | 105 ++++++++++++++++
 .../accuracy_tools/grad_tool/level_adapter.py |  92 ++++++++++++++
 debug/accuracy_tools/grad_tool/utils.py       | 120 ++++++++++++++++++
 5 files changed, 384 insertions(+)
 create mode 100644 debug/accuracy_tools/grad_tool/__init__.py
 create mode 100644 debug/accuracy_tools/grad_tool/grad_monitor.py
 create mode 100644 debug/accuracy_tools/grad_tool/grad_stat_csv.py
 create mode 100644 debug/accuracy_tools/grad_tool/level_adapter.py
 create mode 100644 debug/accuracy_tools/grad_tool/utils.py

diff --git a/debug/accuracy_tools/grad_tool/__init__.py b/debug/accuracy_tools/grad_tool/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/debug/accuracy_tools/grad_tool/grad_monitor.py b/debug/accuracy_tools/grad_tool/grad_monitor.py
new file mode 100644
index 0000000000..1abbc63bc4
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/grad_monitor.py
@@ -0,0 +1,67 @@
+import os
+import torch
+
+from grad_tool.level_adapter import Level, LevelAdapter
+from grad_tool.grad_stat_csv import GradStatCsv
+from grad_tool.utils import get_config, check_numeral_list_ascend, ListCache, data_in_list_target,\
+    write_csv, make_localtime_dir, get_rank_id
+
+
+class GradientMonitor:
+    default_bounds = [-10, -1, -0.1, -0.01, -0.001, 0, 0.001, 0.01, 0.1, 1, 10]
+
+    def __init__(self, config_filepath):
+        config = get_config(config_filepath)
+        self._level_adp: Level = LevelAdapter.level_adapter(config.get("level"))
+        self._param_list = config.get('param_list')
+        self._target_ranks = config.get("rank")
+        self._target_step = config.get("step")
+        self._bounds = config.get("bounds")
+        if not self._bounds or len(self._bounds) == 0:
+            self._bounds = GradientMonitor.default_bounds
+        check_numeral_list_ascend(self._bounds)
+        self._output_path = make_localtime_dir(config.get("output_path"))
+        self._step = -1
+        self._list_cache = ListCache()
+
+    @staticmethod
+    def hook_fun(param_name, f):
+        def backward_hook(grad):
+            f(param_name, grad)
+        return backward_hook
+
+    def model_backward_hook(self, module, gin, gout):
+        if not hasattr(self, "_rank"):
+            setattr(self, "_rank", get_rank_id(gout))
+        if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks):
+            return
+        self._list_cache.flush()
+        self._step += 1
+        if not data_in_list_target(self._step, self._target_step):
+            return
+        output_path = f'{self._output_path}/rank_{self._rank}/grad_summary_{self._step}.csv'
+        write_csv(output_path, [], GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds))
+        self._list_cache.set_output_file(output_path)
+
+    def save_grad_stat(self, param_name, grad):
+        if not hasattr(self, "_rank"):
+            raise AttributeError("grad monitor needs the attribute _rank when saving grad stat")
+        if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks):
+            return
+        if not data_in_list_target(self._step, self._target_step):
+            return
+        grad_info = GradStatCsv.generate_csv_line(
+            level=self._level_adp,
+            param_name=param_name,
+            grad=grad,
+            bounds=self._bounds)
+        self._list_cache.append(grad_info)
+        self._level_adp.save_grad_direction(param_name, grad, f'{self._output_path}/rank_{self._rank}/step_{self._step}')
+
+
+    def monitor(self, model):
+        model.register_full_backward_hook(self.model_backward_hook)
+        for param_name, param in model.named_parameters():
+            if not data_in_list_target(param_name, self._param_list):
+                continue
+            param.register_hook(GradientMonitor.hook_fun(param_name, self.save_grad_stat))
\ No newline at end of file
diff --git a/debug/accuracy_tools/grad_tool/grad_stat_csv.py b/debug/accuracy_tools/grad_tool/grad_stat_csv.py
new file mode 100644
index 0000000000..76a341c1fc
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/grad_stat_csv.py
@@ -0,0 +1,105 @@
+import hashlib
+import torch
+from grad_tool.level_adapter import Level
+
+
+class GradExtremeOps:
+    @staticmethod
+    def tensor_max(tensor):
+        return torch._C._VariableFunctionsClass.max(tensor).cpu().detach().float().numpy().tolist()
+
+    @staticmethod
+    def tensor_min(tensor):
+        return torch._C._VariableFunctionsClass.min(tensor).cpu().detach().float().numpy().tolist()
+
+    @staticmethod
+    def tensor_norm(tensor):
+        return torch._C._VariableFunctionsClass.norm(tensor).cpu().detach().float().numpy().tolist()
+
+
+class GradExtremes:
+    extremes = {
+        "max": GradExtremeOps.tensor_max,
+        "min": GradExtremeOps.tensor_min,
+        "norm": GradExtremeOps.tensor_norm
+    }
+
+
+class GradStatOps:
+    @staticmethod
+    def md5_header(**kwargs):
+        return ["MD5"]
+
+    @staticmethod
+    def intervals_header(**kwargs):
+        level: Level = kwargs.get("level")
+        bounds = kwargs.get("bounds")
+        return level.intervals_header(bounds)
+
+    @staticmethod
+    def extremes_header(**kwargs):
+        return list(GradExtremes.extremes.keys())
+
+    @staticmethod
+    def shape_header(**kwargs):
+        return ["shape"]
+
+    @staticmethod
+    def md5_content(**kwargs):
+        grad = kwargs.get("grad")
+        tensor_bytes = grad.cpu().detach().float().numpy().tobytes()
+        md5_hash = hashlib.md5(tensor_bytes)
+        return [md5_hash.hexdigest()]
+
+    @staticmethod
+    def count_distribution(**kwargs):
+        level: Level = kwargs.get("level")
+        grad = kwargs.get("grad")
+        bounds = kwargs.get("bounds")
+        return level.count_grad_distribution(grad, bounds)
+
+    @staticmethod
+    def extremes_content(**kwargs):
+        grad = kwargs.get("grad")
+        return [f(grad) for f in GradExtremes.extremes.values()]
+
+    @staticmethod
+    def shape_content(**kwargs):
+        grad = kwargs.get("grad")
+        return [list(grad.shape)]
+
+
+class GradStatCsv:
+    CSV = {
+        "MD5": {
+            "header": GradStatOps.md5_header,
+            "content": GradStatOps.md5_content
+        },
+        "distribution": {
+            "header": GradStatOps.intervals_header,
+            "content": GradStatOps.count_distribution
+        },
+        "extremes": {
+            "header": GradStatOps.extremes_header,
+            "content": GradStatOps.extremes_content
+        },
+        "shape": {
+            "header": GradStatOps.shape_header,
+            "content": GradStatOps.shape_content
+        },
+    }
+
+    @staticmethod
+    def generate_csv_header(**kwargs):
+        header = ["param_name"]
+        for func in GradStatCsv.CSV.values():
+            header.extend(func["header"](**kwargs))
+        return header
+
+    @staticmethod
+    def generate_csv_line(**kwargs):
+        line = [kwargs.get("param_name")]
+        for func in GradStatCsv.CSV.values():
+            line.extend(func["content"](**kwargs))
+        return line
+
\ No newline at end of file
diff --git a/debug/accuracy_tools/grad_tool/level_adapter.py b/debug/accuracy_tools/grad_tool/level_adapter.py
new file mode 100644
index 0000000000..d27a79e286
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/level_adapter.py
@@ -0,0 +1,92 @@
+import os
+from abc import ABC, abstractmethod
+import torch
+
+
+class LevelOps:
+    @staticmethod
+    def intervals_header(bounds):
+        intervals = []
+        for i, _ in enumerate(bounds):
+            if i == 0:
+                intervals.append(f"(-inf, {bounds[i]}]")
+            else:
+                intervals.append(f"({bounds[i-1]}, {bounds[i]}]")
+        intervals.extend([f"({bounds[-1]}, inf)", "=0"])
+        return intervals
+
+    @staticmethod
+    def count_grad_distribution(grad, bounds):
+        grad = grad.cpu().detach()
+        if grad.dtype == torch.bfloat16:
+            grad = grad.to(torch.float32)
+        element_num = grad.numel()
+        grad_equal_0_num = (grad == 0).sum().item()
+        bound = torch.Tensor(bounds)
+        bucketsize_result = torch.bucketize(grad, bound)
+        interval_nums = [(bucketsize_result == i).sum().item() for i in range(len(bound) + 1)]
+        interval_nums.append(grad_equal_0_num)
+        return_list = [x / element_num if element_num != 0 else 0 for x in interval_nums]
+        return return_list
+
+
+class Level(ABC):
+    @abstractmethod
+    def save_grad_direction(self, param_name, grad, save_path):
+        pass
+
+    @abstractmethod
+    def count_grad_distribution(self, grad, bounds) -> list:
+        pass
+
+    @abstractmethod
+    def intervals_header(self, bounds) -> list:
+        pass
+
+
+class Level_0(Level):
+    def save_grad_direction(self, param_name, grad, save_path):
+        pass
+
+    def count_grad_distribution(self, grad, bounds):
+        return []
+
+    def intervals_header(self, bounds):
+        return []
+
+
+class Level_1(Level):
+    def save_grad_direction(self, param_name, grad, save_path):
+        pass
+
+    def count_grad_distribution(self, grad, bounds):
+        return LevelOps.count_grad_distribution(grad, bounds)
+
+    def intervals_header(self, bounds):
+        return LevelOps.intervals_header(bounds)
+
+
+class Level_2(Level):
+    def save_grad_direction(self, param_name, grad, save_path):
+        if not os.path.exists(save_path):
+            os.makedirs(save_path)
+        param_grad = torch.Tensor(grad.clone().cpu())
+        is_positive = param_grad > 0
+        torch.save(is_positive, f'{save_path}/{param_name}.pt')
+        print(f'Save {param_name} bool tensor, it has {is_positive.sum()}/{is_positive.numel()} positive elements')
+
+    def count_grad_distribution(self, grad, bounds):
+        return LevelOps.count_grad_distribution(grad, bounds)
+
+    def intervals_header(self, bounds):
+        return LevelOps.intervals_header(bounds)
+
+
+class LevelAdapter:
+    levels = {"L0": Level_0, "L1": Level_1, "L2": Level_2}
+
+    @staticmethod
+    def level_adapter(level):
+        if level not in LevelAdapter.levels:
+            raise Exception(f"level is invalid, not in {list(LevelAdapter.levels.keys())}")
+        return LevelAdapter.levels[level]()
diff --git a/debug/accuracy_tools/grad_tool/utils.py b/debug/accuracy_tools/grad_tool/utils.py
new file mode 100644
index 0000000000..7ba23d7aac
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/utils.py
@@ -0,0 +1,120 @@
+import os
+import time
+import yaml
+import torch
+import pandas as pd
+from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileOpen, create_directory, \
+    check_link, FileChecker, FileCheckConst
+
+
+class ListCache(list):
+    threshold = 1000
+
+    def __init__(self, *args):
+        self._dump_count = 0
+        self._output_file = None
+        super().__init__(*args)
+
+    def __del__(self):
+        self.flush()
+
+    def flush(self):
+        if len(self) == 0:
+            return
+        if not self._output_file:
+            print("dump file path is not set, the cached data will be discarded")
+            return
+        write_csv(self._output_file, self, [])
+        self._dump_count += 1
+        print(f"write {len(self)} items to {self._output_file}, dump count: {self._dump_count}")
+        self.clear()
+
+    def append(self, data):
+        list.append(self, data)
+        if len(self) >= ListCache.threshold:
+            self.flush()
+
+    def set_output_file(self, output_file):
+        self._output_file = output_file
+
+
+def get_config(filepath):
+    with FileOpen(filepath, 'r') as file:
+        config = yaml.safe_load(file)
+    return config
+
+
+def write_csv(filepath, content_list, header):
+    if not os.path.exists(filepath):
+        make_file_safety(filepath)
+        data_frame = pd.DataFrame(columns=header)
+        data_frame.to_csv(filepath, index=False)
+
+    filepath_checker = FileChecker(filepath, FileCheckConst.FILE)
+    filepath_checker.common_check()
+    new_data = pd.DataFrame(list(content for content in content_list))
+    new_data.to_csv(filepath, mode='a+', header=False, index=False)
+
+
+def make_file_safety(file_path: str, permission=0o640):
+    if os.path.islink(file_path):
+        raise RuntimeError("Invalid soft link path: {}".format(file_path))
+    file_real_path = os.path.realpath(file_path)
+    if os.path.exists(file_real_path):
+        return
+    parent_path = os.path.dirname(file_real_path)
+    if not os.path.exists(parent_path):
+        create_directory(parent_path)
+    if not os.access(parent_path, os.W_OK):
+        raise PermissionError("The path {} is not writable!".format(parent_path))
+    try:
+        os.close(os.open(file_real_path, os.O_WRONLY | os.O_CREAT, permission))
+    except OSError as e:
+        raise RuntimeError("Can't create file: " + file_real_path) from e
+    os.chmod(file_real_path, permission)
+
+
+def data_in_list_target(data, lst):
+    return not lst or len(lst) == 0 or data in lst
+
+
+def check_numeral_list_ascend(lst):
+    if any(not isinstance(item, (int, float)) for item in lst):
+        raise Exception("The input list should only contain numbers")
+    if lst != sorted(lst):
+        raise Exception("The input list should be ascending")
+
+
+def localtime_str():
+    return time.strftime("%Y%m%d%H%M%S", time.localtime())
+
+
+def make_localtime_dir(path):
+    if not os.path.isdir(path):
+        create_directory(path)
+    localtime_dir = os.path.join(path, localtime_str())
+    create_directory(localtime_dir)
+    return localtime_dir
+
+
+def get_tensor_rank(x):
+    if isinstance(x, (list, tuple)):
+        if len(x) > 0:
+            return get_tensor_rank(x[0])
+        return None
+    elif isinstance(x, torch.Tensor):
+        device = x.device
+        if device.type == 'cpu':
+            return None
+        else:
+            return device.index
+    return None
+
+
+def get_rank_id(tensor):
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+    rank = get_tensor_rank(tensor)
+    if rank is not None:
+        return rank
+    return os.getpid()
--
Gitee
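
Reviewer note: a minimal usage sketch, not part of the patch. The config keys, level names, and output layout are taken from the code above (get_config, LevelAdapter, GradientMonitor.monitor); the YAML file name, its example values, and the toy model are illustrative assumptions only.

    # grad_config.yaml (hypothetical example)
    #   level: "L1"          # one of L0 / L1 / L2, see LevelAdapter.levels
    #   param_list: []       # empty or missing means monitor all parameters
    #   rank: []             # empty or missing means all ranks
    #   step: []             # empty or missing means all steps
    #   bounds: []           # empty means GradientMonitor.default_bounds; must be ascending numbers
    #   output_path: "./grad_output"

    import torch
    from grad_tool.grad_monitor import GradientMonitor

    monitor = GradientMonitor("grad_config.yaml")   # reads the YAML config above
    model = torch.nn.Linear(4, 2)                   # stand-in for the real training model
    monitor.monitor(model)                          # registers the model and per-parameter backward hooks

    for _ in range(2):                              # a couple of training steps
        loss = model(torch.randn(8, 4)).sum()
        loss.backward()                             # hooks cache per-parameter statistics during backward

    # Per-step CSV summaries (and, at level L2, gradient-sign tensors) are written under
    # <output_path>/<timestamp>/rank_<id>/, as constructed in model_backward_hook above.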