From 67a2fd892fe7e5f937e01257e74c20247c19976e Mon Sep 17 00:00:00 2001
From: pengxiaopeng
Date: Thu, 18 Jan 2024 15:23:35 +0800
Subject: [PATCH] grad tool

---
 debug/accuracy_tools/grad_tool/__init__.py    |   0
 .../accuracy_tools/grad_tool/grad_monitor.py  |  67 ++++++++++
 .../accuracy_tools/grad_tool/grad_stat_csv.py | 105 ++++++++++++++++
 .../accuracy_tools/grad_tool/level_adapter.py |  92 ++++++++++++++
 debug/accuracy_tools/grad_tool/utils.py       | 120 ++++++++++++++++++
 5 files changed, 384 insertions(+)
 create mode 100644 debug/accuracy_tools/grad_tool/__init__.py
 create mode 100644 debug/accuracy_tools/grad_tool/grad_monitor.py
 create mode 100644 debug/accuracy_tools/grad_tool/grad_stat_csv.py
 create mode 100644 debug/accuracy_tools/grad_tool/level_adapter.py
 create mode 100644 debug/accuracy_tools/grad_tool/utils.py

diff --git a/debug/accuracy_tools/grad_tool/__init__.py b/debug/accuracy_tools/grad_tool/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/debug/accuracy_tools/grad_tool/grad_monitor.py b/debug/accuracy_tools/grad_tool/grad_monitor.py
new file mode 100644
index 0000000000..1abbc63bc4
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/grad_monitor.py
@@ -0,0 +1,67 @@
+import os
+import torch
+
+from grad_tool.level_adapter import Level, LevelAdapter
+from grad_tool.grad_stat_csv import GradStatCsv
+from grad_tool.utils import get_config, check_numeral_list_ascend, ListCache, data_in_list_target,\
+    write_csv, make_localtime_dir, get_rank_id
+
+
+class GradientMonitor:
+    default_bounds = [-10, -1, -0.1, -0.01, -0.001, 0, 0.001, 0.01, 0.1, 1, 10]
+
+    def __init__(self, config_filepath):
+        config = get_config(config_filepath)
+        self._level_adp: Level = LevelAdapter.level_adapter(config.get("level"))
+        self._param_list = config.get('param_list')
+        self._target_ranks = config.get("rank")
+        self._target_step = config.get("step")
+        self._bounds = config.get("bounds")
+        if not self._bounds or len(self._bounds) == 0:
+            self._bounds = GradientMonitor.default_bounds
+        check_numeral_list_ascend(self._bounds)
+        self._output_path = make_localtime_dir(config.get("output_path"))
+        self._step = -1
+        self._list_cache = ListCache()
+
+    @staticmethod
+    def hook_fun(param_name, f):
+        def backward_hook(grad):
+            f(param_name, grad)
+        return backward_hook
+
+    def model_backward_hook(self, module, gin, gout):
+        if not hasattr(self, "_rank"):
+            setattr(self, "_rank", get_rank_id(gout))
+        if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks):
+            return
+        self._list_cache.flush()
+        self._step += 1
+        if not data_in_list_target(self._step, self._target_step):
+            return
+        output_path = f'{self._output_path}/rank_{self._rank}/grad_summary_{self._step}.csv'
+        write_csv(output_path, [], GradStatCsv.generate_csv_header(level=self._level_adp, bounds=self._bounds))
+        self._list_cache.set_output_file(output_path)
+
+    def save_grad_stat(self, param_name, grad):
+        if not hasattr(self, "_rank"):
+            raise AttributeError("grad monitor needs the attribute _rank when saving grad stat")
+        if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks):
+            return
+        if not data_in_list_target(self._step, self._target_step):
+            return
+        grad_info = GradStatCsv.generate_csv_line(
+            level=self._level_adp,
+            param_name=param_name,
+            grad=grad,
+            bounds=self._bounds)
+        self._list_cache.append(grad_info)
+        self._level_adp.save_grad_direction(param_name, grad, f'{self._output_path}/rank_{self._rank}/step_{self._step}')
+
+
+    def monitor(self, model):
+        model.register_full_backward_hook(self.model_backward_hook)
+        for param_name, param in model.named_parameters():
+            if not data_in_list_target(param_name, self._param_list):
+                continue
+            param.register_hook(GradientMonitor.hook_fun(param_name, self.save_grad_stat))
\ No newline at end of file
diff --git a/debug/accuracy_tools/grad_tool/grad_stat_csv.py b/debug/accuracy_tools/grad_tool/grad_stat_csv.py
new file mode 100644
index 0000000000..76a341c1fc
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/grad_stat_csv.py
@@ -0,0 +1,105 @@
+import hashlib
+import torch
+from grad_tool.level_adapter import Level
+
+
+class GradExtremeOps:
+    @staticmethod
+    def tensor_max(tensor):
+        return torch._C._VariableFunctionsClass.max(tensor).cpu().detach().float().numpy().tolist()
+
+    @staticmethod
+    def tensor_min(tensor):
+        return torch._C._VariableFunctionsClass.min(tensor).cpu().detach().float().numpy().tolist()
+
+    @staticmethod
+    def tensor_norm(tensor):
+        return torch._C._VariableFunctionsClass.norm(tensor).cpu().detach().float().numpy().tolist()
+
+
+class GradExtremes:
+    extremes = {
+        "max": GradExtremeOps.tensor_max,
+        "min": GradExtremeOps.tensor_min,
+        "norm": GradExtremeOps.tensor_norm
+    }
+
+
+class GradStatOps:
+    @staticmethod
+    def md5_header(**kwargs):
+        return ["MD5"]
+
+    @staticmethod
+    def intervals_header(**kwargs):
+        level: Level = kwargs.get("level")
+        bounds = kwargs.get("bounds")
+        return level.intervals_header(bounds)
+
+    @staticmethod
+    def extremes_header(**kwargs):
+        return list(GradExtremes.extremes.keys())
+
+    @staticmethod
+    def shape_header(**kwargs):
+        return ["shape"]
+
+    @staticmethod
+    def md5_content(**kwargs):
+        grad = kwargs.get("grad")
+        tensor_bytes = grad.cpu().detach().float().numpy().tobytes()
+        md5_hash = hashlib.md5(tensor_bytes)
+        return [md5_hash.hexdigest()]
+
+    @staticmethod
+    def count_distribution(**kwargs):
+        level: Level = kwargs.get("level")
+        grad = kwargs.get("grad")
+        bounds = kwargs.get("bounds")
+        return level.count_grad_distribution(grad, bounds)
+
+    @staticmethod
+    def extremes_content(**kwargs):
+        grad = kwargs.get("grad")
+        return [f(grad) for f in GradExtremes.extremes.values()]
+
+    @staticmethod
+    def shape_content(**kwargs):
+        grad = kwargs.get("grad")
+        return [list(grad.shape)]
+
+
+class GradStatCsv:
+    CSV = {
+        "MD5": {
+            "header": GradStatOps.md5_header,
+            "content": GradStatOps.md5_content
+        },
+        "distribution": {
+            "header": GradStatOps.intervals_header,
+            "content": GradStatOps.count_distribution
+        },
+        "extremes": {
+            "header": GradStatOps.extremes_header,
+            "content": GradStatOps.extremes_content
+        },
+        "shape": {
+            "header": GradStatOps.shape_header,
+            "content": GradStatOps.shape_content
+        },
+    }
+
+    @staticmethod
+    def generate_csv_header(**kwargs):
+        header = ["param_name"]
+        for func in GradStatCsv.CSV.values():
+            header.extend(func["header"](**kwargs))
+        return header
+
+    @staticmethod
+    def generate_csv_line(**kwargs):
+        line = [kwargs.get("param_name")]
+        for func in GradStatCsv.CSV.values():
+            line.extend(func["content"](**kwargs))
+        return line
+
\ No newline at end of file
diff --git a/debug/accuracy_tools/grad_tool/level_adapter.py b/debug/accuracy_tools/grad_tool/level_adapter.py
new file mode 100644
index 0000000000..d27a79e286
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/level_adapter.py
@@ -0,0 +1,92 @@
+import os
+from abc import ABC, abstractmethod
+import torch
+
+
+class LevelOps:
+    @staticmethod
+    def intervals_header(bounds):
+        intervals = []
+        for i, _ in enumerate(bounds):
+            if i == 0:
+                intervals.append(f"(-inf, {bounds[i]}]")
+            else:
+                intervals.append(f"({bounds[i-1]}, {bounds[i]}]")
+        intervals.extend([f"({bounds[-1]}, inf)", "=0"])
+        return intervals
+
+    @staticmethod
+    def count_grad_distribution(grad, bounds):
+        grad = grad.cpu().detach()
+        if grad.dtype == torch.bfloat16:
+            grad = grad.to(torch.float32)
+        element_num = grad.numel()
+        grad_equal_0_num = (grad == 0).sum().item()
+        bound = torch.Tensor(bounds)
+        bucketsize_result = torch.bucketize(grad, bound)
+        interval_nums = [(bucketsize_result == i).sum().item() for i in range(len(bound) + 1)]
+        interval_nums.append(grad_equal_0_num)
+        return_list = [x / element_num if element_num != 0 else 0 for x in interval_nums]
+        return return_list
+
+
+class Level(ABC):
+    @abstractmethod
+    def save_grad_direction(self, param_name, grad, save_path):
+        pass
+
+    @abstractmethod
+    def count_grad_distribution(self, grad, bounds) -> list:
+        pass
+
+    @abstractmethod
+    def intervals_header(self, bounds) -> list:
+        pass
+
+
+class Level_0(Level):
+    def save_grad_direction(self, param_name, grad, save_path):
+        pass
+
+    def count_grad_distribution(self, grad, bounds):
+        return []
+
+    def intervals_header(self, bounds):
+        return []
+
+
+class Level_1(Level):
+    def save_grad_direction(self, param_name, grad, save_path):
+        pass
+
+    def count_grad_distribution(self, grad, bounds):
+        return LevelOps.count_grad_distribution(grad, bounds)
+
+    def intervals_header(self, bounds):
+        return LevelOps.intervals_header(bounds)
+
+
+class Level_2(Level):
+    def save_grad_direction(self, param_name, grad, save_path):
+        if not os.path.exists(save_path):
+            os.makedirs(save_path)
+        param_grad = torch.Tensor(grad.clone().cpu())
+        is_positive = param_grad > 0
+        torch.save(is_positive, f'{save_path}/{param_name}.pt')
+        print(f'Save {param_name} bool tensor, it has {is_positive.sum()}/{is_positive.numel()} positive elements')
+
+    def count_grad_distribution(self, grad, bounds):
+        return LevelOps.count_grad_distribution(grad, bounds)
+
+    def intervals_header(self, bounds):
+        return LevelOps.intervals_header(bounds)
+
+
+class LevelAdapter:
+    levels = {"L0": Level_0, "L1": Level_1, "L2": Level_2}
+
+    @staticmethod
+    def level_adapter(level):
+        if level not in LevelAdapter.levels:
+            raise Exception(f"level is invalid, not in {list(LevelAdapter.levels.keys())}")
+        return LevelAdapter.levels[level]()
diff --git a/debug/accuracy_tools/grad_tool/utils.py b/debug/accuracy_tools/grad_tool/utils.py
new file mode 100644
index 0000000000..7ba23d7aac
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/utils.py
@@ -0,0 +1,120 @@
+import os
+import time
+import yaml
+import torch
+import pandas as pd
+from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileOpen, create_directory, \
+    check_link, FileChecker, FileCheckConst
+
+
+class ListCache(list):
+    threshold = 1000
+
+    def __init__(self, *args):
+        self._dump_count = 0
+        self._output_file = None
+        super().__init__(*args)
+
+    def __del__(self):
+        self.flush()
+
+    def flush(self):
+        if len(self) == 0:
+            return
+        if not self._output_file:
+            print("dump file path is not set, the cached data will be discarded")
+            return
+        write_csv(self._output_file, self, [])
+        self._dump_count += 1
+        print(f"write {len(self)} items to {self._output_file}, dump count: {self._dump_count}")
+        self.clear()
+
+    def append(self, data):
+        list.append(self, data)
+        if len(self) >= ListCache.threshold:
+            self.flush()
+
+    def set_output_file(self, output_file):
+        self._output_file = output_file
+
+
+def get_config(filepath):
+    with FileOpen(filepath, 'r') as file:
+        config = yaml.safe_load(file)
+    return config
+
+
+def write_csv(filepath, content_list, header):
+    if not os.path.exists(filepath):
+        make_file_safety(filepath)
+        data_frame = pd.DataFrame(columns=header)
+        data_frame.to_csv(filepath, index=False)
+
+    filepath_checker = FileChecker(filepath, FileCheckConst.FILE)
+    filepath_checker.common_check()
+    new_data = pd.DataFrame(list(content for content in content_list))
+    new_data.to_csv(filepath, mode='a+', header=False, index=False)
+
+
+def make_file_safety(file_path: str, permission=0o640):
+    if os.path.islink(file_path):
+        raise RuntimeError("Invalid soft link path: {}".format(file_path))
+    file_real_path = os.path.realpath(file_path)
+    if os.path.exists(file_real_path):
+        return
+    parent_path = os.path.dirname(file_real_path)
+    if not os.path.exists(parent_path):
+        create_directory(parent_path)
+    if not os.access(parent_path, os.W_OK):
+        raise PermissionError("The path {} is not writable!".format(parent_path))
+    try:
+        os.close(os.open(file_real_path, os.O_WRONLY | os.O_CREAT, permission))
+    except OSError as e:
+        raise RuntimeError("Can't create file: " + file_real_path) from e
+    os.chmod(file_real_path, permission)
+
+
+def data_in_list_target(data, lst):
+    return not lst or len(lst) == 0 or data in lst
+
+
+def check_numeral_list_ascend(lst):
+    if any(not isinstance(item, (int, float)) for item in lst):
+        raise Exception("The input list should only contain numbers")
+    if lst != sorted(lst):
+        raise Exception("The input list should be ascending")
+
+
+def localtime_str():
+    return time.strftime("%Y%m%d%H%M%S", time.localtime())
+
+
+def make_localtime_dir(path):
+    if not os.path.isdir(path):
+        create_directory(path)
+    localtime_dir = os.path.join(path, localtime_str())
+    create_directory(localtime_dir)
+    return localtime_dir
+
+
+def get_tensor_rank(x):
+    if isinstance(x, (list, tuple)):
+        if len(x) > 0:
+            return get_tensor_rank(x[0])
+        return None
+    elif isinstance(x, torch.Tensor):
+        device = x.device
+        if device.type == 'cpu':
+            return None
+        else:
+            return device.index
+    return None
+
+
+def get_rank_id(tensor):
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+    rank = get_tensor_rank(tensor)
+    if rank is not None:
+        return rank
+    return os.getpid()
--
Gitee
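
Reviewer note: a minimal usage sketch, not part of the patch. The config keys, level names, and output layout are taken from the code above (get_config, LevelAdapter, GradientMonitor.monitor); the YAML file name, its example values, and the toy model are illustrative assumptions only.

    # grad_config.yaml (hypothetical example)
    #   level: "L1"          # one of L0 / L1 / L2, see LevelAdapter.levels
    #   param_list: []       # empty or missing means monitor all parameters
    #   rank: []             # empty or missing means all ranks
    #   step: []             # empty or missing means all steps
    #   bounds: []           # empty means GradientMonitor.default_bounds; must be ascending numbers
    #   output_path: "./grad_output"

    import torch
    from grad_tool.grad_monitor import GradientMonitor

    monitor = GradientMonitor("grad_config.yaml")   # reads the YAML config above
    model = torch.nn.Linear(4, 2)                   # stand-in for the real training model
    monitor.monitor(model)                          # registers the model and per-parameter backward hooks

    for _ in range(2):                              # a couple of training steps
        loss = model(torch.randn(8, 4)).sum()
        loss.backward()                             # hooks cache per-parameter statistics during backward

    # Per-step CSV summaries (and, at level L2, gradient-sign tensors) are written under
    # <output_path>/<timestamp>/rank_<id>/, as constructed in model_backward_hook above.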