diff --git a/debug/accuracy_tools/kj600/kj600/config.json b/debug/accuracy_tools/kj600/kj600/config.json deleted file mode 100644 index 733a3f0d8f8ea7d858f0a38b5c5cd196c97c6d68..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/kj600/kj600/config.json +++ /dev/null @@ -1,10 +0,0 @@ -// config.json examples -// -// example1: -// { -// "targets": { -// "fc": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"}, -// "relu": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"} -// }, -// "module_ranks":"1,2,3,4" -// } \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/msmonitor/README.md similarity index 94% rename from debug/accuracy_tools/kj600/README.md rename to debug/accuracy_tools/msmonitor/README.md index f486dd6e798db2869fefa62cfcffa4b3659f3327..c90d5e431188fbef1260ce46045feea88f5d1dc3 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/msmonitor/README.md @@ -1,4 +1,4 @@ -# TensorProbe (codename:kj600) 模型训练状态监控工具 +# MsMonitor模型训练状态监控工具 ## 简介 @@ -18,29 +18,23 @@ | sqlalchemy | | pymysql | -### 2. 安装 kj600 +### 2. 安装 MsMonitor -方式一:从 git 直接安装 +下载源码安装 -``` -pip install git+https://gitee.com/xiangsen2/kj600.git -``` - -方式二:下载源码安装 - -``` -git clone https://gitee.com/xiangsen2/kj600.git -cd kj600 +```shell +git clone https://gitee.com/ascend/mstt.git +cd mstt/debug/accuracy_tools/msmonitor pip install . ``` # 快速上手 - 下面以Ascend/ModelLink训练框架为例,给出kj600工具的使用方法。 + 下面以Ascend/ModelLink训练框架为例,给出MsMonitor工具的使用方法。 1. 在ModelLink的根目录,创建json配置文件,如llama2_config.json,内容如下: -``` +```json { "targets": { "language_model.encoder.layers.0": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"} @@ -103,7 +97,7 @@ pip install . 对于language_model.embedding.word_embeddings这类输入层,我们不关心输入的情况下,可以不填"input"和"input_grad",监控的状态中不会包含输入的相关信息。config文件示例如下: -``` +```json { "targets": { "language_model.embedding.word_embeddings": {"output": "tensor","output_grad":"tuple[1]:0"} @@ -111,12 +105,12 @@ pip install . } ``` -2. 在训练器中加入代码,开启kj600训练监控。 +2. 在训练器中加入代码,开启MsMonitor训练监控。 例如在ModelLink/pretrain_gpt.py的model_provider GPTModel构造后加入以下代码, **注意优化器类型opt_ty** : - ``` - from kj600.module_hook import TrainerMon + ```python + from msmonitor.module_hook import TrainerMon hooker = TrainerMon("./llama2_config.json", params_have_main_grad=True, opt_ty="Megatron_DistributedOptimizer") # or opt_ty=Megatron_Float16OptimizerWithFloat16Params hooker.hook_modules(model=model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size) ``` @@ -126,36 +120,36 @@ pip install . 如果要监控混合精度优化器的动量和方差, 需要在混合精度优化器构造后加入如下代码。 目前只支持Megatron_DistributedOptimizer, 使用bf16或者fp16混合精度时开启分布式优化器。 或者Megatron_Float16OptimizerWithFloat16Params, 使用bf16或者fp16混合精度选项并且不开启分布式优化器。 - ``` + ```python model, optimizer, opt_param_scheduler = setup_model_and_optimizer( model_provider, model_type) # 插入位置 - from kj600.module_hook import TrainerMon + from msmonitor.module_hook import TrainerMon TrainerMon.set_wrapped_optimizer(optimizer) ``` 3. 配置tensorboard写入的目录 - ``` - export KJ600_OUTPUT_DIR=/xxx/output_dir + ```shell + export MONITOR_OUTPUT_DIR=/xxx/output_dir ``` 4. 开始预训练,在日志中如果发现以下内容, 则说明指定的模块被成功监视。 - ``` + ```txt > language_model.encoder.layers.0 is monitored successfully > 1 out of 1 modules are monitored. ``` 5. 训练过程中,打开tensorboard,可以查看训练的中间状态: -``` -tensorboard --logdir=$KJ600_OUTPUT_DIR +```shell +tensorboard --logdir=$MONITOR_OUTPUT_DIR ``` 之后,运行以下SSH命令来建立端口转发,可以在本地通过http://localhost:6006访问tensorboard: -``` +```shell ssh -N -L localhost:6006:localhost:6006 your_username@remote_server_address ``` diff --git a/debug/accuracy_tools/kj600/img/cpu_info.png b/debug/accuracy_tools/msmonitor/img/cpu_info.png similarity index 100% rename from debug/accuracy_tools/kj600/img/cpu_info.png rename to debug/accuracy_tools/msmonitor/img/cpu_info.png diff --git a/debug/accuracy_tools/kj600/img/train.png b/debug/accuracy_tools/msmonitor/img/train.png similarity index 100% rename from debug/accuracy_tools/kj600/img/train.png rename to debug/accuracy_tools/msmonitor/img/train.png diff --git a/debug/accuracy_tools/kj600/img/train_with_kj600.png b/debug/accuracy_tools/msmonitor/img/train_with_kj600.png similarity index 100% rename from debug/accuracy_tools/kj600/img/train_with_kj600.png rename to debug/accuracy_tools/msmonitor/img/train_with_kj600.png diff --git a/debug/accuracy_tools/kj600/kj600/__init__.py b/debug/accuracy_tools/msmonitor/msmonitor/__init__.py similarity index 100% rename from debug/accuracy_tools/kj600/kj600/__init__.py rename to debug/accuracy_tools/msmonitor/msmonitor/__init__.py diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/msmonitor/msmonitor/anomaly_detect.py similarity index 74% rename from debug/accuracy_tools/kj600/kj600/anomaly_detect.py rename to debug/accuracy_tools/msmonitor/msmonitor/anomaly_detect.py index 7d3389a8508c3663efa6987d62a9269f4b715ab9..e7581a5ac55f0af4351dfdeb91137f1f6c21b79b 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/anomaly_detect.py @@ -1,21 +1,25 @@ import os -import statistics as st +import statistics as st from abc import ABC from typing import List import sys -from torch.utils.tensorboard import SummaryWriter from collections import defaultdict -from kj600.utils import print_info_log, print_error_log -from kj600.file_check import check_path_before_create, change_mode, FileCheckConst, create_directory +from torch.utils.tensorboard import SummaryWriter +from msmonitor.utils import print_info_log, print_warn_log, print_error_log +from msmonitor.file_check import check_path_before_create, change_mode, FileCheckConst, create_directory + class ScanRule(ABC): def apply(self, history, cur): raise NotImplementedError("abstract method apply is not implemented") + class AnomalyTurbulence(ScanRule): name = "AnomalyTurbulence" + def __init__(self, threshold) -> None: self.threshold = threshold + def apply(self, history, cur): baseline = st.mean(history) if isinstance(history, list) else history @@ -25,6 +29,7 @@ class AnomalyTurbulence(ScanRule): else: return cur < up_bound + class AnomalyScanner: @staticmethod @@ -33,9 +38,9 @@ class AnomalyScanner: return [] alert_rules = [] for spec in specs: - rule_cls_name = spec["rule_name"] - rule_args = spec["args"] - cur_module = sys.modules[__name__] + rule_cls_name = spec.get("rule_name") + rule_args = spec.get("args") + cur_module = sys.modules.get(__name__) rule_cls = getattr(cur_module, rule_cls_name) rule_instance = rule_cls(**rule_args) alert_rules.append(rule_instance) @@ -47,10 +52,11 @@ class AnomalyScanner: for rule in scan_rules: anomaly = rule.apply(history, cur) if anomaly: - return anomaly, rule.name - return anomaly, None + break + return anomaly, rule.name -class bcolors: + +class BColor: HEADER = '\033[95m' OKBLUE = '\033[94m' OKCYAN = '\033[96m' @@ -61,6 +67,7 @@ class bcolors: BOLD = '\033[1m' UNDERLINE = '\033[4m' + class SummaryWriterWithAD(SummaryWriter): def __init__(self, path, ad_rules, job_id, anomaly_inform=False): check_path_before_create(path) @@ -77,20 +84,22 @@ class SummaryWriterWithAD(SummaryWriter): self.job_id = job_id self.anomaly_inform = anomaly_inform - def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): + def add_scalar(self, tag, scalar_value, global_step=None): new_avg = avg = scalar_value if tag in self.tag2scalars: - N = len(self.tag2scalars[tag]) + steps = len(self.tag2scalars[tag]) _, avg = self.tag2scalars[tag][-1] - new_avg = (avg*N + scalar_value)/(N + 1) + new_avg = (avg*steps + scalar_value) / (steps + 1) self.tag2scalars[tag].append((scalar_value, new_avg)) detected, rule_name = self._ad(scalar_value, history=avg) if detected: - print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") - exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" + print_info_log(f"{BColor.WARNING}> Rule {rule_name} reports anomaly signal in {tag} \ + at step {global_step}.{BColor.ENDC}") + exception_message = f"{BColor.WARNING}> Rule {rule_name} reports anomaly signal in {tag} \ + at step {global_step}.{BColor.ENDC}" if self.anomaly_inform: self.anomaly_inform.run(exception_message, self.job_id) - return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) + return super().add_scalar(tag, scalar_value, global_step) def _ad(self, scalar_value, history): return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_inform.py b/debug/accuracy_tools/msmonitor/msmonitor/anomaly_inform.py similarity index 84% rename from debug/accuracy_tools/kj600/kj600/anomaly_inform.py rename to debug/accuracy_tools/msmonitor/msmonitor/anomaly_inform.py index 485c06d4d24ed4279f73f2ec3c4a8acd4268192d..9562a342afab4d8d484df6813036c006affb0f92 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_inform.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/anomaly_inform.py @@ -1,22 +1,22 @@ import smtplib from email.mime.text import MIMEText -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone -from kj600.database import Database, ExceptionMessage +from msmonitor.database import Database, ExceptionMessage # define class InformRegistry to get inform_sub_class class AnomalyInformFactory: @staticmethod def create_informer(**kwargs): - if kwargs['recipient'] == "database": + if kwargs.get('recipient') == "database": return DatabaseInform(**kwargs) - elif kwargs['recipient'] == "email": + elif kwargs.get('recipient') == "email": return EmailInform(**kwargs) else: raise ValueError("Invaild recipient specified") -# define class AnomalyInform to inform with database or email + class AnomalyInform: def __init__(self, **kwargs): self.inform_args = kwargs @@ -29,16 +29,17 @@ class AnomalyInform: def run(self, exception_message, job_id): if self.time != 0 and self.current_time == 0: - self.current_time = datetime.now() + self.current_time = datetime.now(tz = timezone.utc) if self.time == 0 or ((self.current_time - self.time) > timedelta(minutes=self.interval_time)): self.exception_message_list.append(exception_message) self.inform_fun(self.exception_message_list, job_id) self.exception_message_list = [] - self.time = datetime.now() + self.time = datetime.now(tz = timezone.utc) elif (self.current_time - self.time) <= timedelta(minutes=self.interval_time): self.exception_message_list.append(exception_message) - self.current_time = datetime.now() + self.current_time = datetime.now(tz = timezone.utc) + class DatabaseInform(AnomalyInform): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -49,10 +50,11 @@ class DatabaseInform(AnomalyInform): def inform_fun(self, exception_message_list, job_id): save_list = [] for exception_message in exception_message_list: - item = {'job_id': job_id, 'message': exception_message, 'create_time': datetime.now()} + item = {'job_id': job_id, 'message': exception_message, 'create_time': datetime.now(tz = timezone.utc)} save_list.append(ExceptionMessage(**item)) self.database.insert_batch(save_list) + class EmailInform(AnomalyInform): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -68,7 +70,8 @@ class EmailInform(AnomalyInform): message["From"] = self.inform_args.get('send_email_address', None) message["To"] = self.inform_args.get('receive_email_address', None) - with smtplib.SMTP(self.inform_args.get('smtp_server', None), self.inform_args.get('smtp_port', 587)) as server: + with smtplib.SMTP(self.inform_args.get('smtp_server', None), + self.inform_args.get('smtp_port', 587)) as server: server.starttls() server.login(self.inform_args.get('send_email_username', None), self.inform_args.get('send_email_password', None)) server.sendmail(self.inform_args.get('send_email_address', None), diff --git a/debug/accuracy_tools/kj600/kj600/const.py b/debug/accuracy_tools/msmonitor/msmonitor/const.py similarity index 100% rename from debug/accuracy_tools/kj600/kj600/const.py rename to debug/accuracy_tools/msmonitor/msmonitor/const.py diff --git a/debug/accuracy_tools/kj600/kj600/database.py b/debug/accuracy_tools/msmonitor/msmonitor/database.py similarity index 95% rename from debug/accuracy_tools/kj600/kj600/database.py rename to debug/accuracy_tools/msmonitor/msmonitor/database.py index ce02ab7429d066dc76c127eefab5c1f6720d612c..3b2ecf4e619b36e23c72615a7d8bd6fd4ae65ca6 100644 --- a/debug/accuracy_tools/kj600/kj600/database.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/database.py @@ -26,8 +26,8 @@ class Database: self.engine = create_engine(connection_str, pool_recycle=25200) def get_session(self): - Session = sessionmaker(bind=self.engine) - return Session() + session = sessionmaker(bind=self.engine) + return session() def create_table(self): Base.metadata.create_all(self.engine, checkfirst=True) diff --git a/debug/accuracy_tools/kj600/kj600/distributed/distributed_ops.yaml b/debug/accuracy_tools/msmonitor/msmonitor/distributed/distributed_ops.yaml similarity index 100% rename from debug/accuracy_tools/kj600/kj600/distributed/distributed_ops.yaml rename to debug/accuracy_tools/msmonitor/msmonitor/distributed/distributed_ops.yaml diff --git a/debug/accuracy_tools/kj600/kj600/distributed/stack_blacklist.yaml b/debug/accuracy_tools/msmonitor/msmonitor/distributed/stack_blacklist.yaml similarity index 76% rename from debug/accuracy_tools/kj600/kj600/distributed/stack_blacklist.yaml rename to debug/accuracy_tools/msmonitor/msmonitor/distributed/stack_blacklist.yaml index 00b0013619fcfa1445a8df18c3c7d16764fb4872..777ebbd1721328b461783b37253ab92478f622a9 100644 --- a/debug/accuracy_tools/kj600/kj600/distributed/stack_blacklist.yaml +++ b/debug/accuracy_tools/msmonitor/msmonitor/distributed/stack_blacklist.yaml @@ -1,5 +1,5 @@ stack: -- kj600/distributed +- msmonitor/distributed - site-packages/torch/nn/modules/module.py - multiprocessing - debugpy \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py b/debug/accuracy_tools/msmonitor/msmonitor/distributed/wrap_distributed.py similarity index 92% rename from debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py rename to debug/accuracy_tools/msmonitor/msmonitor/distributed/wrap_distributed.py index 77fd7924f937487feca6a300fa9a9023a26b3c4b..cd6d843ef9b34ffc9e1aaab6e28a3dd64d9f73cd 100644 --- a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/distributed/wrap_distributed.py @@ -1,13 +1,12 @@ import os -import yaml import re import inspect +import yaml import torch import torch.nn as nn import torch.distributed as dist -from kj600.utils import print_error_log - -from ..module_metric import get_metrics +from msmonitor.utils import print_error_log +from msmonitor.module_metric import get_metrics try: import torch_npu @@ -86,6 +85,21 @@ class ApiRegistry: else: setattr(api_group, cc_api_name, cc_api_entry_func) + @staticmethod + def redirect_wait(): + global ORIGIN_WAIT + global PENDING_ASYNC_CC_BY_HANDLE + + def wrapped_wait(work): + def wrapped_wait(*args, **kwargs): + ORIGIN_WAIT(*args, **kwargs) + if args[0] in PENDING_ASYNC_CC_BY_HANDLE: + store_func = PENDING_ASYNC_CC_BY_HANDLE.pop(args[0]) + store_func() + return wrapped_wait + + dist.Work.wait = wrapped_wait(dist.Work) + def redirect_api(self): self.set_api_attr(dist, self.distributed_attr_hooked) self.set_api_attr(dist.distributed_c10d, self.distributed_attr_hooked) @@ -101,20 +115,6 @@ class ApiRegistry: for op_name in get_distributed_ops(): self.distributed_attr_hooked[op_name] = DistributedOPTemplate(op_name, pre_hooks, post_hooks) - def redirect_wait(self): - global ORIGIN_WAIT - global PENDING_ASYNC_CC_BY_HANDLE - - def wrapped_wait(work): - def wrapped_wait(*args, **kwargs): - ORIGIN_WAIT(*args, **kwargs) - if args[0] in PENDING_ASYNC_CC_BY_HANDLE: - store_func = PENDING_ASYNC_CC_BY_HANDLE.pop(args[0]) - store_func() - return wrapped_wait - - dist.Work.wait = wrapped_wait(dist.Work) - def stack_filter(stack): for pattern in StackBlackList: @@ -122,14 +122,16 @@ def stack_filter(stack): return False return True + def get_callstack(): callstack = [] - for (_, path, line, func, code, _) in inspect.stack(): + for (_, path, line, func, _, _) in inspect.stack(): stack_line = f'{path}[{line}]' if stack_filter(stack_line): callstack.append(stack_line+' '+func) return callstack + @torch.no_grad() def op_aggregate(op, tensorlist): if isinstance(tensorlist, torch.Tensor): @@ -146,6 +148,7 @@ def op_aggregate(op, tensorlist): return sum(tensorlist) / len(tensorlist) if len(tensorlist) != 0 else 0 return torch.nan + def update_data(old, new): for op, tag2tensorlist in new.items(): if op not in old: @@ -168,6 +171,7 @@ def is_target_line(codeline): return True return False + @torch.no_grad() def catch_data(cc_context, ops, args, prefix): tensor_args = {} @@ -182,13 +186,15 @@ def catch_data(cc_context, ops, args, prefix): tensor_args[f'{prefix}_{len(tensor_args)}'] = stacked_arg new_data = {op: get_metrics(op, tensor_args, 1e-8) for op in ops} - cc_context.data=update_data(cc_context.data, new_data) + cc_context.data = update_data(cc_context.data, new_data) + def create_async_callback_func(context, ops, args, prefix): def store_data(): catch_data(context, ops, args, prefix) return store_data + def get_tensor_dtype(args): dtypes = [] for arg in args: @@ -231,10 +237,12 @@ def create_hooks(context, monitor): args = args + tuple(kwargs.values()) if out: # async if isinstance(out, dist.Work): - PENDING_ASYNC_CC_BY_HANDLE[out] = create_async_callback_func(context[module.op_name_], monitor.ops, args, PREFIX_POST) + PENDING_ASYNC_CC_BY_HANDLE[out] = create_async_callback_func(context[module.op_name_], + monitor.ops, args, PREFIX_POST) elif isinstance(out, list): # batch_isend_irecv - for o in out: - PENDING_ASYNC_CC_BY_HANDLE[o] = create_async_callback_func(context[module.op_name_], monitor.ops, args, PREFIX_POST) + for iout in out: + PENDING_ASYNC_CC_BY_HANDLE[iout] = create_async_callback_func(context[module.op_name_], + monitor.ops, args, PREFIX_POST) return out catch_data(context[module.op_name_], monitor.ops, args, PREFIX_POST) return out diff --git a/debug/accuracy_tools/kj600/kj600/features.py b/debug/accuracy_tools/msmonitor/msmonitor/features.py similarity index 79% rename from debug/accuracy_tools/kj600/kj600/features.py rename to debug/accuracy_tools/msmonitor/msmonitor/features.py index 7810188f7d7df66dce4c489f18062f9381b95646..764c14c19115ecfeee1c5ed4935c11893e1ee01e 100644 --- a/debug/accuracy_tools/kj600/kj600/features.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/features.py @@ -1,20 +1,23 @@ import torch from torch.autograd.functional import jacobian -from kj600.utils import print_info_log +from msmonitor.utils import print_info_log @torch.no_grad() def square_sum(x: torch.tensor): return (x * x).sum() + @torch.no_grad() def get_min(x: torch.tensor): return torch.min(x) + @torch.no_grad() def get_norm(x: torch.tensor): return torch.norm(x, p=2) + @torch.no_grad() def get_max(x: torch.tensor): return torch.max(x) @@ -24,6 +27,7 @@ def get_max(x: torch.tensor): def get_zeros(x: torch.tensor, eps: float): return torch.sum(torch.abs(x) < eps) / x.numel() + @torch.no_grad() def get_sign_matches(x: torch.tensor, y:torch.tensor): xs = x.sign() @@ -35,35 +39,36 @@ def get_sign_matches(x: torch.tensor, y:torch.tensor): same_direction_ratio = torch.tensor(0.) return same_direction_ratio + @torch.no_grad() def eff_rank(param: torch.tensor, threshold=1e-10): - U, S, Vh = torch.linalg.svd(param.float()) - rank = torch.sum(S > threshold) + u, s, vh = torch.linalg.svd(param.float()) + rank = torch.sum(s > threshold) return rank # modular neural tangent kernel @torch.no_grad() -def mNTK(module: torch.nn.Module, x: torch.tensor): - J_theta_l = jacobian(module, x) - mntk = torch.matmul(J_theta_l, J_theta_l.t()) +def m_ntk(module: torch.nn.Module, x: torch.tensor): + j_theta_l = jacobian(module, x) + mntk = torch.matmul(j_theta_l, j_theta_l.t()) return mntk @torch.no_grad() -def power_iteration(A, num_iterations): - b = torch.randn(A.size(1), 1) +def power_iteration(a, num_iterations): + b = torch.randn(a.size(1), 1) for _ in range(num_iterations): - b = torch.matmul(A, b) + b = torch.matmul(a, b) b_norm = torch.norm(b) b = b / b_norm if b_norm != 0 else 0 - eigval = torch.matmul(torch.matmul(b.t(), A), b) + eigval = torch.matmul(torch.matmul(b.t(), a), b) return eigval @torch.no_grad() def lambda_max_subsample(module: torch.nn.Module, x: torch.tensor, num_iterations=100, subsample_size=None): - mntk = mNTK(module, x) + mntk = m_ntk(module, x) if subsample_size is None: subsample_size = min(mntk.size(0), mntk.size(1)) idx = torch.randperm(mntk.size(0))[:subsample_size] @@ -77,6 +82,7 @@ def lambda_max_subsample(module: torch.nn.Module, x: torch.tensor, num_iteration def cal_histc(tensor_cal, bins_total, min_val, max_val): return torch.histc(tensor_cal, bins=bins_total, min=min_val, max=max_val) + @torch.no_grad() def get_nans(t): return torch.isnan(t).sum() diff --git a/debug/accuracy_tools/kj600/kj600/file_check.py b/debug/accuracy_tools/msmonitor/msmonitor/file_check.py similarity index 99% rename from debug/accuracy_tools/kj600/kj600/file_check.py rename to debug/accuracy_tools/msmonitor/msmonitor/file_check.py index c567f94545e2ed946e3335ce34ae1757046e2efa..642545948c38d108b54ad1491eaddcb4e0bd590f 100644 --- a/debug/accuracy_tools/kj600/kj600/file_check.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/file_check.py @@ -187,7 +187,7 @@ def check_path_owner_consistent(path): def check_path_pattern_vaild(path): if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): - logger.error('The file path %s contains special characters.' %(path)) + logger.error('The file path %s contains special characters.' % path) raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/msmonitor/msmonitor/module_hook.py similarity index 81% rename from debug/accuracy_tools/kj600/kj600/module_hook.py rename to debug/accuracy_tools/msmonitor/msmonitor/module_hook.py index 193e5acb01a5f5c1079d1d6d82d84edfff0ec219..3b011b770a2df70ffefcbebb672e43c47784fecf 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/module_hook.py @@ -2,20 +2,21 @@ import os import uuid import json from collections import defaultdict -from datetime import datetime +from datetime import datetime, timezone import torch import torch.distributed as dist from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook -from kj600.module_spec_verifier import validate_config_spec -from kj600.optimizer_collect import MixPrecsionOptimizerMon, OptimizerMonFactory -from kj600.features import eff_rank, get_sign_matches -from kj600.visualizer import HeatmapVisualizer -from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD -from kj600.anomaly_inform import AnomalyInformFactory -from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics -from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate -from kj600.utils import print_warn_log, print_info_log, print_rank_0, get_param_struct, check_path_length, check_path_pattern_valid, change_mode, FileCheckConst, validate_config -from kj600.file_check import FileOpen +from msmonitor.module_spec_verifier import validate_config_spec +from msmonitor.optimizer_collect import MixPrecsionOptimizerMon, OptimizerMonFactory +from msmonitor.features import eff_rank, get_sign_matches +from msmonitor.visualizer import HeatmapVisualizer +from msmonitor.anomaly_detect import AnomalyScanner, SummaryWriterWithAD +from msmonitor.anomaly_inform import AnomalyInformFactory +from msmonitor.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics +from msmonitor.distributed.wrap_distributed import api_register, create_hooks, op_aggregate +from msmonitor.utils import print_warn_log, print_info_log, print_rank_0, get_param_struct, check_path_length, \ + check_path_pattern_valid, change_mode, FileCheckConst, validate_config +from msmonitor.file_check import FileOpen @@ -45,7 +46,7 @@ class OptimizerContext: def __init__(self) -> None: self.step = 0 self.param_effective_rank = defaultdict(float) - self.param_mg_direction = defaultdict(float) + self.param_mg_direction = defaultdict(float) self.param_adam_update = defaultdict() self.param_adam_ratio = defaultdict() self.param_weight_grad = defaultdict() @@ -73,11 +74,11 @@ class CommunicationContext: def aggregate(self): self.data = self._agg(self.data) + class TrainerMon: tensor_metrics = TensorMetrics() - # opt_ty: "Megatron_Float16OptimizerWithFloat16Params" or "Megatron_DistributedOptimizer" def __init__(self, config_file_path, params_have_main_grad=True, opt_ty=None) -> None: self.module_fwd_hook_context_by_module = defaultdict(ModuleHookContext) self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext) @@ -127,11 +128,14 @@ class TrainerMon: alert_setting = self.config.get('alert', {"rules":[]}) self.alert_rules = AnomalyScanner.load_rules(alert_setting["rules"]) - anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None + if "inform" in alert_setting: + anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) + else: + anomaly_inform = None self.optimizer_hooked = False - output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') - cur_time = datetime.now().strftime('%b%d_%H-%M-%S') + output_base_dir = os.getenv('MONITOR_OUTPUT_DIR', './msmonitor_output') + cur_time = datetime.now(tz = timezone.utc).strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] if dist.is_initialized(): cur_path = os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}") @@ -147,7 +151,7 @@ class TrainerMon: self.summary_writer = SummaryWriterWithAD(cur_path, self.alert_rules, unique_id, anomaly_inform) full_path = os.path.realpath(cur_path) - change_mode(full_path,FileCheckConst.DATA_DIR_AUTHORITY) + change_mode(full_path, FileCheckConst.DATA_DIR_AUTHORITY) # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) @@ -183,11 +187,40 @@ class TrainerMon: rank = dist.get_rank() if (rank not in rank_list) and len(rank_list) != 0: return - TrainerMon.tensor_metrics.stat_insert(target_tensor, ops_list, module_name, tensor_name, rank) + tag = get_summary_writer_tag_name(module_name, tensor_name, rank) + TrainerMon.tensor_metrics.stat_insert(target_tensor, ops_list, tag) + + @staticmethod + def build_tbtag_tensor_map(module_name, tag, tensor): + metrics = {} + rank = dist.get_rank() if dist.is_initialized() else None + key = get_summary_writer_tag_name(module_name, tag, rank) + if tensor is not None: + metrics[key] = tensor + return metrics + + @staticmethod + def generate_cc_metrics(cc_name, cc_tensor): + metrics = defaultdict(dict) + rank = dist.get_rank() if dist.is_initialized() else None + for op, tag2tensor in cc_tensor.data.items(): + for tag, tensor in tag2tensor.items(): + key = get_summary_writer_tag_name(cc_name, tag, rank) + metrics[op].update({key: tensor}) + cc_tensor.reset() + return metrics + + def generate_param_metrics(self, tag, param_tensor): + metrics = {} + rank = dist.get_rank() if dist.is_initialized() else None + for _, name in self.param2name.items(): + key = get_summary_writer_tag_name(name, tag, rank) + if name not in param_tensor or param_tensor[name] is None: + continue + metrics[key] = param_tensor[name] + return metrics def hook_modules(self, model:torch.nn.Module, grad_acc_steps): - # fwd=0, bkd=1 - # targets is module name list like ["xx.xxx1", "xxx.xxx2"] which can be obtained when first run. if not isinstance(model, torch.nn.Module): raise TypeError("model should be a nn.Module") if not isinstance(grad_acc_steps, int) or isinstance(grad_acc_steps, bool): @@ -211,40 +244,12 @@ class TrainerMon: for name, param in model.named_parameters(): print_rank_0(f"\t{name}") for target_module, _ in self.config['targets'].items(): - if name.startswith(target_module): # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 + if name.startswith(target_module): self.param_name_list.append(name) self.param2name[param] = name self.hook_optimizer() return - def build_tbtag_tensor_map(self, module_name, tag, tensor): - metrics = {} - rank = dist.get_rank() if dist.is_initialized() else None - key = get_summary_writer_tag_name(module_name, tag, rank) - if tensor is not None: - metrics[key] = tensor - return metrics - - def generate_param_metrics(self, tag, param_tensor): - metrics = {} - rank = dist.get_rank() if dist.is_initialized() else None - for param, name in self.param2name.items(): - key = get_summary_writer_tag_name(name, tag, rank) - if name not in param_tensor or param_tensor[name] is None: - continue - metrics[key] = param_tensor[name] - return metrics - - def generate_cc_metrics(self, cc_name, cc_tensor): - metrics = defaultdict(dict) - rank = dist.get_rank() if dist.is_initialized() else None - for op, tag2tensor in cc_tensor.data.items(): - for tag, tensor in tag2tensor.items(): - key = get_summary_writer_tag_name(cc_name, tag, rank) - metrics[op].update({key: tensor}) - cc_tensor.reset() - return metrics - def write_adhoc_check(self, step): TrainerMon.tensor_metrics.flush(self.summary_writer) @@ -253,7 +258,8 @@ class TrainerMon: return for _, fwd_context in self.module_fwd_hook_context_by_module.items(): if not len(fwd_context.actv) == self.micro_batch_number: - print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") + print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, \ + {self.micro_batch_number}") for metric_name in self.ops: write_metrics_tensorboard(metric_name, self.summary_writer, fwd_context.actv, step) fwd_context.actv.clear() @@ -269,7 +275,7 @@ class TrainerMon: # in DDP by default use params_have_main_grad def optimizer_pre_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] - if self.print_struct and not all(value == {} for value in self.module_struct.values()) and not self.struct_printed: + if self.print_struct and not all(value == {} for value in self.module_struct. values()) and not self.struct_printed: self._smallest_rank_print("> module struct:") self._smallest_rank_print(json.dumps(self.module_struct)) self.struct_printed = True @@ -277,12 +283,15 @@ class TrainerMon: raise Exception("exit after first step when print model struct") if self.cc_log_only and context.step > 0: self._smallest_rank_print("> Used communication ops and corresponding stack") - self._smallest_rank_print(json.dumps({k:[i.split(';') for i in v] for k,v in self.cc_logged_stack.items()}, indent=4)) + self._smallest_rank_print(json.dumps({k:[i.split(';') for i in v] for k, v in self.cc_logged_stack.items()}, + indent=4)) raise Exception("exit after first step when print cc stack") - - context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self, - optimizer, self.param2name) + opt_status = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) + context.param_exp_avg = opt_status.exp_avg + context.param_exp_avg_sq = opt_status.exp_avg_sq + context.param_adam_update = opt_status.update + context.param_adam_ratio = opt_status.ratio for param, name in self.param2name.items(): if "params_effrank" in self.config and name in self.config["params_effrank"]: @@ -308,8 +317,6 @@ class TrainerMon: tbtag_tensor_map.update(self.generate_param_metrics('exp_avg_sq', context.param_exp_avg_sq)) if self.mg_direction: tbtag_tensor_map.update(self.generate_param_metrics('mg_direction', context.param_mg_direction)) - # if not tbtag_tensor_map: - # return metric_dict = {} for metric_name in self.ops: metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) @@ -317,7 +324,7 @@ class TrainerMon: c.aggregate() cc_metrics = self.generate_cc_metrics(k, c) for op, m in cc_metrics.items(): - metric_dict[op].update(m) + metric_dict.get(op).update(m) if not metric_dict: return context.metric_list.append(metric_dict) @@ -332,9 +339,11 @@ class TrainerMon: if self.ur_distribution: for param_name, _ in context.param_adam_update.items(): - self.update_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_update', rank), context.step, self.summary_writer) + tag = get_summary_writer_tag_name(param_name, 'adam_update', rank) + self.update_heatmap_visualizer[param_name].visualize(tag, context.step, self.summary_writer) for param_name, _ in context.param_adam_ratio.items(): - self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer) + tag = get_summary_writer_tag_name(param_name, 'adam_ratio', rank) + self.ratio_heatmap_visualizer[param_name].visualize(tag, context.step, self.summary_writer) for metric_name in self.ops: if not context.metric_list: @@ -368,7 +377,7 @@ class TrainerMon: def fwd_hook_fun(module, module_input, module_output): context: ModuleHookContext = self.module_fwd_hook_context_by_module[module] if self.print_struct: - self.module_struct[context.module_name].update( + self.module_struct.get(context.module_name).update( {"input": f"{get_param_struct(module_input)}", "output": f"{get_param_struct(module_output)}"}) return if not self.xy_distribution: @@ -378,8 +387,14 @@ class TrainerMon: context.set_format_by_arg('output', self.config['targets']) if not context.verified: if not context.ignore_in: - context.focused_in_col = validate_config_spec(context.format_by_arg['input'], module_input, context.module_name, 'input') - context.focused_out_col = validate_config_spec(context.format_by_arg['output'], module_output, context.module_name, 'output') + context.focused_in_col = validate_config_spec(context.format_by_arg['input'], + module_input, + context.module_name, + 'input') + context.focused_out_col = validate_config_spec(context.format_by_arg['output'], + module_output, + context.module_name, + 'output') context.verified = True # expect output be tensor type tbtag_tensor_map = {} @@ -393,7 +408,8 @@ class TrainerMon: metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) if context.micro_step == 0 and context.actv: print_warn_log( - f"actv context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. Now clear it.") + f"actv context of {context.module_name} is not empty when first micro_step, \ + maybe something wrong happened. Now clear it.") context.actv.clear() context.actv.append(metric_dict) @@ -406,8 +422,9 @@ class TrainerMon: def bwd_hook_fun(module, input_grad, output_grad): context: ModuleHookContext = self.module_bwd_hook_context_by_module[module] if self.print_struct: - self.module_struct[context.module_name].update( - {"input_grad": f"{get_param_struct(input_grad)}", "output_grad": f"{get_param_struct(output_grad)}"}) + self.module_struct.get(context.module_name).update( + {"input_grad": f"{get_param_struct(input_grad)}", + "output_grad": f"{get_param_struct(output_grad)}"}) return if not self.xy_distribution: return @@ -416,21 +433,32 @@ class TrainerMon: context.set_format_by_arg('output_grad', self.config['targets']) if not context.verified: if not context.ignore_in: - context.focused_in_col = validate_config_spec(context.format_by_arg['input_grad'], input_grad, context.module_name, 'input_grad') - context.focused_out_col = validate_config_spec(context.format_by_arg['output_grad'], output_grad, context.module_name, 'output_grad') + context.focused_in_col = validate_config_spec(context.format_by_arg['input_grad'], + input_grad, + context.module_name, + 'input_grad') + context.focused_out_col = validate_config_spec(context.format_by_arg['output_grad'], + output_grad, + context.module_name, + 'output_grad') context.verified = True tbtag_tensor_map = {} if not context.ignore_in: cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'input_grad', cared_input_grad)) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, + 'input_grad', + cared_input_grad)) cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'output_grad', cared_output_grad)) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, + 'output_grad', + cared_output_grad)) metric_dict = {} for metric_name in self.ops: metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) if context.micro_step == 0 and context.actvgrad: - print_warn_log(f"actvgrad context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. Now clear it.") + print_warn_log(f"actvgrad context of {context.module_name} is not empty when first micro_step, \ + maybe something wrong happened. Now clear it.") context.actvgrad.clear() context.actvgrad.append(metric_dict) diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/msmonitor/msmonitor/module_metric.py similarity index 85% rename from debug/accuracy_tools/kj600/kj600/module_metric.py rename to debug/accuracy_tools/msmonitor/msmonitor/module_metric.py index e09536b072cf7953e6b6106420936416d4264d0e..84daa38d414994f3a4d48ce33a565bae013726b1 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/module_metric.py @@ -1,7 +1,7 @@ import math import statistics -from kj600.features import square_sum, get_max, get_min, get_zeros, get_nans, get_norm +from msmonitor.features import square_sum, get_max, get_min, get_zeros, get_nans, get_norm def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): @@ -11,30 +11,28 @@ def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): return f"{module_or_param_name}/{rank}/{tag}" -# 用于存储所有metric实现类的注册表 config_metric_registry = {} def register_config_metric(key, cls=None): """装饰器 用于注册Metric的实现类""" if cls is None: - # 无参数时,返回装饰器函数 return lambda cls: register_config_metric(key, cls) config_metric_registry[key] = cls return cls + class TensorMetrics: def __init__(self) -> None: - self.metrics = {} #tensor_tag --> [] + self.metrics = {} self.cur_idx = {} fun_map = {"norm": get_norm, "max": get_max, "min": get_min} - #get stats and insert into metrics dictionary - def stat_insert(self, tensor, stat_ops, module_name, tensor_name, rank, eps=1e-8): - prefix = get_summary_writer_tag_name(module_name, tensor_name, rank) + + def stat_insert(self, tensor, stat_ops, tag, eps=1e-8): for stat_op in stat_ops: y = TensorMetrics.fun_map[stat_op](tensor) - key = f"{prefix}_{stat_op}" + key = f"{tag}_{stat_op}" if key not in self.metrics: self.metrics[key] = [] self.cur_idx[key] = 0 @@ -47,6 +45,7 @@ class TensorMetrics: tb_writer.add_scalar(key, v.item(), global_step=self.cur_idx[key]) self.cur_idx[key] += 1 + class Metric(object): @staticmethod def get_metric_value(tensor, eps): @@ -62,6 +61,7 @@ class Metric(object): metrics_dict[tag] = self.get_metric_value(tensor, eps) return metrics_dict + @register_config_metric("min") class MinMetric(Metric): @staticmethod @@ -113,11 +113,12 @@ class ZerosMetric(Metric): zeros_value = statistics.mean([item[metric_name][key].item() for item in metric_value]) summary_writer.add_scalar(f'{key}_zeros', zeros_value, step) + @register_config_metric("nans") class NaNsMetric(Metric): @staticmethod - def get_metric_value(t, eps): - return get_nans(t) + def get_metric_value(tensor, eps): + return get_nans(tensor) @staticmethod def metric_tensorboard(metric_name, summary_writer, metric_value, step): @@ -125,16 +126,18 @@ class NaNsMetric(Metric): nans_value = sum([v[metric_name][key].item() for v in metric_value]) summary_writer.add_scalar(f'{key}_nans', nans_value, step) + @register_config_metric("id") class IdentMetric(Metric): @staticmethod def get_metric_value(tensor, eps): + multi_dim = None if tensor.dim() != 0: - return None + return multi_dim return tensor @staticmethod - def metric_tensorboard(metric_name, summary_writer, metric_value, step): #metric_value is a dict, key is parameter name and value is a list of scalar tensor + def metric_tensorboard(metric_name, summary_writer, metric_value, step): if len(metric_value) == 1: for key, value in metric_value[0][metric_name].items(): if not value: @@ -147,7 +150,8 @@ def get_metrics(metric_name, tag2tensor, eps): fun_metric = config_metric_registry[metric_name] return fun_metric().get_metrics(tag2tensor, eps) except KeyError as e: - raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e + raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, \ + actual metric: {metric_name}") from e def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step): @@ -155,4 +159,5 @@ def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step): fun_metric = config_metric_registry[metric_name] return fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) except KeyError as e: - raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e + raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, \ + actual metric: {metric_name}") from e diff --git a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py b/debug/accuracy_tools/msmonitor/msmonitor/module_spec_verifier.py similarity index 77% rename from debug/accuracy_tools/kj600/kj600/module_spec_verifier.py rename to debug/accuracy_tools/msmonitor/msmonitor/module_spec_verifier.py index 66ea2805907cf436e43864ec57232eb6722c3cdf..85851199fbc9881dbc2c05761974fa24544598bb 100644 --- a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/module_spec_verifier.py @@ -31,9 +31,11 @@ class TensorValidator(ConfigValidator): return pattern.match(config_spec) def validate(self, actual_data, module_name:str, data_type:str, pattern_match): + index = None if not torch.is_tensor(actual_data): - raise ValueError(f"Format of {module_name} {data_type} does not match the required format 'tensor' in config.") - return None + raise ValueError(f"Format of {module_name} {data_type} does not match \ + the required format 'tensor' in config.") + return index @register_config_validator @@ -45,11 +47,14 @@ class TupleValidator(ConfigValidator): def validate(self, actual_data, module_name: str, data_type: str, pattern_match): length, index = map(int, pattern_match.groups()) if not (0 <= index < length): - raise ValueError(f"Format of {module_name} {data_type} in config.json does not match the required format 'tuple[x]:y'. y must be greater than or equal to 0 and less than x.") + raise ValueError(f"Format of {module_name} {data_type} in config.json does not match the \ + required format 'tuple[x]:y'. y must be greater than or equal to 0 and less than x.") if not isinstance(actual_data, tuple): - raise ValueError(f"Type of {module_name} {data_type} does not match spec of config.json, should be tuple, please check.") + raise ValueError(f"Type of {module_name} {data_type} does not match spec of config.json, \ + should be tuple, please check.") if len(actual_data) != length: - raise ValueError(f"Length of {module_name} {data_type} does not match spec of config.json, should be {length}, actual is {len(actual_data)} please check.") + raise ValueError(f"Length of {module_name} {data_type} does not match spec of config.json, \ + should be {length}, actual is {len(actual_data)} please check.") return index @@ -60,4 +65,5 @@ def validate_config_spec(config_spec:str, actual_data, module_name:str, data_typ if pattern_match: focused_col = config_validator.validate(actual_data, module_name, data_type, pattern_match) return focused_col - raise ValueError(f"config spec in {module_name} {data_type} not supported, expected spec:'tuple\[(\d+)\]:(\d+)' or 'tensor', actual spec: {config_spec}.") + raise ValueError(f"config spec in {module_name} {data_type} not supported, \ + expected spec:'tuple\[(\d+)\]:(\d+)' or 'tensor', actual spec: {config_spec}.") diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/msmonitor/msmonitor/optimizer_collect.py similarity index 87% rename from debug/accuracy_tools/kj600/kj600/optimizer_collect.py rename to debug/accuracy_tools/msmonitor/msmonitor/optimizer_collect.py index b5337aa01a59ea8353a65884f4e9676c082d9924..be2d797019c745d436f835c995ff196ed8781d74 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/optimizer_collect.py @@ -1,8 +1,10 @@ -from collections import defaultdict +from collections import defaultdict, namedtuple import torch import torch.distributed as dist +MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio")) + class MixPrecsionOptimizerMon: wrapped_optimizer = None @@ -47,14 +49,15 @@ class MixPrecsionOptimizerMon: ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) - return exp_avg_dict, exp_avg_sq_dict, update_dict, ratio_dict + return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict) class MegatronDistributedOptimizerMon(MixPrecsionOptimizerMon): def fetch_mv(self, monitor, torch_opt, params2name): mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer if not (hasattr(mix_prec_opt, "model_float16_groups") and hasattr(mix_prec_opt, "shard_fp32_from_float16_groups")): - raise Exception("megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, \ + raise Exception("megatron distributed optimizer should have model_float16_groups \ + and shard_fp32_from_float16_groups, \ if not, please check megatron-lm version") if not self.fp16_to_fp32_param and mix_prec_opt is not None: for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, mix_prec_opt.shard_fp32_from_float16_groups): @@ -66,7 +69,7 @@ class MegatronDistributedOptimizerMon(MixPrecsionOptimizerMon): class DummyOptimizerMon(MixPrecsionOptimizerMon): def fetch_mv(self, monitor, torch_opt, params2name): - return None, None, None, None + return MVResult(exp_avg=None, exp_avg_sq=None, update=None, ratio=None) class OptimizerMonFactory: @@ -78,4 +81,5 @@ class OptimizerMonFactory: return MegatronDistributedOptimizerMon() if opt_ty is None or opt_ty == "unknown": return DummyOptimizerMon() - raise Exception("opt_ty should be Megatron_Float16OptimizerWithFloat16Params or Megatron_DistributedOptimizer or None or unknown") + raise Exception("opt_ty should be Megatron_Float16OptimizerWithFloat16Params or \ + Megatron_DistributedOptimizer or None or unknown") diff --git a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/cc_utils.py similarity index 94% rename from debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/cc_utils.py index aa1ff688ec1c417204fc067e636535fef8a35bc0..f31bb35681ffaf75116c595bdb81412c9f638529 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/cc_utils.py @@ -11,8 +11,8 @@ except: BACKEND = 'nccl' DEVICE = 'cuda' -from kj600.features import square_sum, get_max, get_min, get_zeros -from kj600.module_hook import CommunicationContext +from msmonitor.features import square_sum, get_max, get_min, get_zeros +from msmonitor.module_hook import CommunicationContext OP_FUNCS = { diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_basic_functions.json b/debug/accuracy_tools/msmonitor/msmonitor/unittest/config_basic_functions.json similarity index 100% rename from debug/accuracy_tools/kj600/kj600/unittest/config_basic_functions.json rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/config_basic_functions.json diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_cc.json b/debug/accuracy_tools/msmonitor/msmonitor/unittest/config_cc.json similarity index 100% rename from debug/accuracy_tools/kj600/kj600/unittest/config_cc.json rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/config_cc.json diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_cc_codeline_ranks.json b/debug/accuracy_tools/msmonitor/msmonitor/unittest/config_cc_codeline_ranks.json similarity index 49% rename from debug/accuracy_tools/kj600/kj600/unittest/config_cc_codeline_ranks.json rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/config_cc_codeline_ranks.json index 720fbb9dd0ee639a412c4a7e62b3a6a73fce227d..75192ffa791750735015ff234b49f82840cf22fc 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/config_cc_codeline_ranks.json +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/config_cc_codeline_ranks.json @@ -2,7 +2,7 @@ "targets": { "foo": {} }, - "cc_distribution": {"enable": true, "cc_codeline":["kj600/unittest/test_cc_codeline_ranks.py\\[19\\]"]}, + "cc_distribution": {"enable": true, "cc_codeline":["msmonitor/unittest/test_cc_codeline_ranks.py\\[19\\]"]}, "module_ranks": [1], "ops":["max","min","norm","zeros"] } \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_cc_logonly.json b/debug/accuracy_tools/msmonitor/msmonitor/unittest/config_cc_logonly.json similarity index 100% rename from debug/accuracy_tools/kj600/kj600/unittest/config_cc_logonly.json rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/config_cc_logonly.json diff --git a/debug/accuracy_tools/kj600/kj600/unittest/expected_cc_log.json b/debug/accuracy_tools/msmonitor/msmonitor/unittest/expected_cc_log.json similarity index 40% rename from debug/accuracy_tools/kj600/kj600/unittest/expected_cc_log.json rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/expected_cc_log.json index 8f2edd7ecdb373242f40ae938ca9a880a45e3264..f2992115208acc962d2c7b15f282c77770af1dcd 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/expected_cc_log.json +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/expected_cc_log.json @@ -3,8 +3,8 @@ [ "|torch.float32||", "0|1", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[18] test_all_gather", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[40] main", + "/home/jovyan/workspace/kj_dev/msmonitor/unittest/test_cc_log_only.py[18] test_all_gather", + "/home/jovyan/workspace/kj_dev/msmonitor/unittest/test_cc_log_only.py[40] main", "[1] " ] ], @@ -12,8 +12,8 @@ [ "torch.float32|||", "0|1", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[23] test_all_reduce", - "/home/jovyan/workspace/kj_dev/kj600/unittest/test_cc_log_only.py[41] main", + "/home/jovyan/workspace/kj_dev/msmonitor/unittest/test_cc_log_only.py[23] test_all_reduce", + "/home/jovyan/workspace/kj_dev/msmonitor/unittest/test_cc_log_only.py[41] main", "[1] " ] ] diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_anomaly_inform.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_anomaly_inform.py similarity index 96% rename from debug/accuracy_tools/kj600/kj600/unittest/test_anomaly_inform.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_anomaly_inform.py index d1b73d7e8aaf311eef57cf0ee4dd99a763166d66..cf69791ff641cfee31ac030fcd55eac6300295fd 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_anomaly_inform.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_anomaly_inform.py @@ -1,7 +1,7 @@ import uuid import unittest -from kj600.anomaly_inform import AnomalyInformFactory +from msmonitor.anomaly_inform import AnomalyInformFactory class TestAnomalyInform(unittest.TestCase): diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_basic_functions.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_basic_functions.py similarity index 93% rename from debug/accuracy_tools/kj600/kj600/unittest/test_basic_functions.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_basic_functions.py index b7cdd3385b575b231702411fb01ebca2b67613bf..972c31d16a99033b2fef6af1ac59e56f6d8f8524 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_basic_functions.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_basic_functions.py @@ -1,14 +1,14 @@ import unittest import shutil -import torch import json import os +import torch try: import torch_npu device = torch.device('npu:0') except ModuleNotFoundError: device = torch.device('cpu') -from kj600.module_hook import TrainerMon +from msmonitor.module_hook import TrainerMon from tensorboard.backend.event_processing.event_accumulator import EventAccumulator @@ -30,13 +30,13 @@ class ToyDataset(torch.utils.data.Dataset): def __getitem__(self, idx): return self.data[idx].to(device), self.labels[idx].to(device) def get_file_path(): - output_dir = os.environ.get("KJ600_OUTPUT_DIR") + output_dir = os.environ.get("MONITOR_OUTPUT_DIR") for root1, dirs, files in os.walk(output_dir): for root2, dir, file in os.walk(os.path.join(root1, dirs[-1])): return os.path.join(root2, file[0]) def get_config(): - os.environ["KJ600_OUTPUT_DIR"] = "./test_kj600_output" + os.environ["MONITOR_OUTPUT_DIR"] = "./test_monitor_output" with open("config_basic_functions.json", 'r') as file: config_test = json.load(file) return config_test @@ -55,7 +55,7 @@ def get_tensorbaord(event_file_path): return scalers_tag, images_tag def clean_output(): - folder_path = os.environ.get("KJ600_OUTPUT_DIR") + folder_path = os.environ.get("MONITOR_OUTPUT_DIR") if os.path.exists(folder_path): shutil.rmtree(folder_path) @@ -73,9 +73,9 @@ def train(): for (inputs, targets) in train_loader: optimizer.zero_grad() # inputs and param torch.float32 -> torch.float16 - inputs = inputs.half() + inputs = inputs for param in model.parameters(): - param.data = param.data.half() + param.data = param.data # outputs torch.float32 outputs = model(inputs) output = outputs[0] @@ -86,9 +86,9 @@ def train(): loss.backward() optimizer.step() -class TestKj600(unittest.TestCase): +class TestMsMonitor(unittest.TestCase): def __init__(self, method_name: str) -> None: - super(TestKj600, self).__init__(method_name) + super(TestMsMonitor, self).__init__(method_name) self.config_test = get_config() self.event_file_path = None self.scalers_tag = None diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_cc.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc.py similarity index 97% rename from debug/accuracy_tools/kj600/kj600/unittest/test_cc.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc.py index b5e92417a41e09539a00761e743670ba1b409ff7..1d3de49928ef0d76fa9186b95a99964cd20cd19a 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_cc.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc.py @@ -5,8 +5,8 @@ import torch from torch import nn from torch import distributed as dist import torch.multiprocessing as mp -from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import * +from msmonitor.module_hook import TrainerMon +from msmonitor.unittest.cc_utils import DEVICE, assert_context, assert_equal, wrap_reset, ddp_setup DEBUG = False DIM = 2 @@ -231,7 +231,7 @@ def main(rank, world_size): steps = 2 net = Model() - monitor = TrainerMon("kj600/unittest/config_cc.json", opt_ty="Megatron_Float16OptimizerWithFloat16Params") + monitor = TrainerMon("msmonitor/unittest/config_cc.json", opt_ty="Megatron_Float16OptimizerWithFloat16Params") # monitor = None # monitor.hook_optimizer() # to enable tb optimizer = torch.optim.Adam(net.parameters()) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_codeline_ranks.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc_codeline_ranks.py similarity index 88% rename from debug/accuracy_tools/kj600/kj600/unittest/test_cc_codeline_ranks.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc_codeline_ranks.py index d635441e155736340648b31dc1eab3d61d03f2fd..94c60bfa7d34cf1396f605daf1d653617a3549dd 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_codeline_ranks.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc_codeline_ranks.py @@ -3,8 +3,8 @@ sys.path.append(".") import torch from torch import distributed as dist import torch.multiprocessing as mp -from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import * +from msmonitor.module_hook import TrainerMon +from msmonitor.unittest.cc_utils import DEVICE, assert_context, assert_equal, wrap_reset, ddp_setup, Model @wrap_reset def test_all_gather(context, rank, target_rank, world_size, async_op): @@ -32,7 +32,7 @@ def main(rank, world_size): async_op = False net = Model() - monitor = TrainerMon("kj600/unittest/config_cc_codeline_ranks.json") + monitor = TrainerMon("msmonitor/unittest/config_cc_codeline_ranks.json") target_rank = monitor.module_rank_list # monitor = None # monitor.hook_optimizer() # to enable tb diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_log_only.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc_log_only.py similarity index 88% rename from debug/accuracy_tools/kj600/kj600/unittest/test_cc_log_only.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc_log_only.py index d7508d4af51d0549105a92eea3b7ff717924aea4..6108c153bdeca119ce5290d8c878582ef169da35 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_cc_log_only.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_cc_log_only.py @@ -5,8 +5,8 @@ import json import torch from torch import distributed as dist import torch.multiprocessing as mp -from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import * +from msmonitor.module_hook import TrainerMon +from msmonitor.unittest.cc_utils import DEVICE, assert_context, assert_equal, wrap_reset, ddp_setup, Model with open(os.path.join(os.path.dirname(__file__), 'expected_cc_log.json')) as f: @@ -30,7 +30,7 @@ def main(rank, world_size): async_op = False net = Model() - monitor = TrainerMon("kj600/unittest/config_cc_logonly.json") + monitor = TrainerMon("msmonitor/unittest/config_cc_logonly.json") monitor.hook_optimizer() # to enable tb optimizer = torch.optim.Adam(net.parameters()) cc_context = monitor.cc_context diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_database.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_database.py similarity index 96% rename from debug/accuracy_tools/kj600/kj600/unittest/test_database.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_database.py index a9046d9c07ebb1cdf3f16490554f7223d893e51e..94103a8d58e61d839606ea655251c8067ec9f8f7 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_database.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_database.py @@ -5,7 +5,7 @@ from unittest import TestCase from sqlalchemy import inspect -from kj600.database import Database, ExceptionMessage +from msmonitor.database import Database, ExceptionMessage class TestDatabase(TestCase): diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_features.py similarity index 95% rename from debug/accuracy_tools/kj600/kj600/unittest/test_features.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_features.py index bc8c6dd71ab4e0bf708cf3d97d02dab3a2ded9cc..9d63358831ccf3c087eba0738c9bb0b561367643 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_features.py @@ -2,7 +2,7 @@ import unittest import torch import torch.nn as nn import torch_npu -from kj600.features import eff_rank +from msmonitor.features import eff_rank class TestFeatureCalculation(unittest.TestCase): diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_module_hook.py similarity index 88% rename from debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py rename to debug/accuracy_tools/msmonitor/msmonitor/unittest/test_module_hook.py index f81312691d35825fad05b7ed04db352bc96b2c20..75218c18a5c4397b2f471a886081c0452aa5083f 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/unittest/test_module_hook.py @@ -8,7 +8,7 @@ try: except ModuleNotFoundError: device = torch.device('cpu') import torch.nn.functional as F -from kj600.module_hook import TrainerMon # Modify PYTHONPATH to import TrainerMon +from msmonitor.module_hook import TrainerMon # Modify PYTHONPATH to import TrainerMon #from hook_api import reg_grad_hook, reg_grad_one_hook, reg_module_backward_hook, reg_module_forward_hook #from torch.cuda.amp import GradScaler @@ -23,7 +23,7 @@ from kj600.module_hook import TrainerMon # Modify PYTHONPATH to import TrainerMo #debugger = PD(dump_path="./dump/", hook_name="dump", step=[1, 2, 3], enable_dataloader=False) #debugger.configure_hook(mode="list", scope=["optim_Adam_step"], ) -parser = argparse.ArgumentParser(prog="kj600 debug", description="kj600 sample code", epilog="") +parser = argparse.ArgumentParser(prog="msmonitor debug", description="msmonitor sample code", epilog="") parser.add_argument("-o", "--out_dir", type=str, default=".") args = parser.parse_args() DTYPE = torch.float32 @@ -54,7 +54,7 @@ config = { # reg_module_backward_hook(net, module_bwd_hook, config) optimizer = torch.optim.Adam(net.parameters(), lr=0.0001) -hooker = TrainerMon('./kj600/unittest/config_1.json', opt_ty = 'Megatron_Float16OptimizerWithFloat16Params') +hooker = TrainerMon('./msmonitor/unittest/config_1.json', opt_ty = 'Megatron_Float16OptimizerWithFloat16Params') hooker.hook_modules(model=net, global_batch_size=2, dp=1, micro_batch_size=2, fwd_or_bkd=0, params_have_main_grad=False) # hooker.hook_optimizer(optimizer) diff --git a/debug/accuracy_tools/kj600/kj600/utils.py b/debug/accuracy_tools/msmonitor/msmonitor/utils.py similarity index 98% rename from debug/accuracy_tools/kj600/kj600/utils.py rename to debug/accuracy_tools/msmonitor/msmonitor/utils.py index b8048a2c06dc9f72023ca00b915775367985bd80..d8e6cfc3e1b1a47ad6d335ca1f7e988589868c60 100644 --- a/debug/accuracy_tools/kj600/kj600/utils.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/utils.py @@ -5,7 +5,7 @@ import re from functools import wraps from torch import distributed as dist -from kj600.const import Const +from msmonitor.const import Const FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024 FILE_NAME_MAX_LENGTH = 255 @@ -65,6 +65,7 @@ class FileCheckConst: YAML_SUFFIX: MAX_YAML_SIZE } + class FileCheckException(Exception): """ Class for File Check Exception @@ -131,6 +132,7 @@ def print_warn_log(warn_msg): """ _print_log("WARNING", warn_msg) + def get_param_struct(param): if isinstance(param, tuple): return f"tuple[{len(param)}]" @@ -138,6 +140,7 @@ def get_param_struct(param): return f"list[{len(param)}]" return "tensor" + def check_link(path): abs_path = os.path.abspath(path) if os.path.islink(abs_path): @@ -220,7 +223,7 @@ def validate_ops(ops): return valid_ops def validate_ranks(ranks): - world_size = dist.get_world_size() + world_size = dist.get_world_size() if dist.is_initialized() else 1 if not isinstance(ranks, list): raise TypeError("module_ranks should be a list") for rank in ranks: diff --git a/debug/accuracy_tools/kj600/kj600/visualizer.py b/debug/accuracy_tools/msmonitor/msmonitor/visualizer.py similarity index 71% rename from debug/accuracy_tools/kj600/kj600/visualizer.py rename to debug/accuracy_tools/msmonitor/msmonitor/visualizer.py index e1929bfa3fb338b1cb66cda80a128e83176bfcbf..53df397923f49684f3a506ec9553dc5e94edc61b 100644 --- a/debug/accuracy_tools/kj600/kj600/visualizer.py +++ b/debug/accuracy_tools/msmonitor/msmonitor/visualizer.py @@ -1,7 +1,7 @@ import torch import numpy as np import matplotlib.pyplot as plt -from kj600.features import cal_histc +from msmonitor.features import cal_histc class HeatmapVisualizer: @@ -15,7 +15,10 @@ class HeatmapVisualizer: self.histogram_edges = torch.linspace(self.min_val, self.max_val, self.histogram_bins_num) def pre_cal(self, tensor): - self.cur_step_histogram_data = cal_histc(tensor_cal=tensor, bins_total=self.histogram_bins_num, min_val=self.min_val, max_val=self.max_val) + self.cur_step_histogram_data = cal_histc(tensor_cal=tensor, + bins_total=self.histogram_bins_num, + min_val=self.min_val, + max_val=self.max_val) def visualize(self, tag_name:str, step, summary_writer): if self.histogram_sum_data_np is None or self.histogram_sum_data_np.size == 0: @@ -23,13 +26,16 @@ class HeatmapVisualizer: else: # add new data along a different axis because we transposed early # matrix shape is [bins_num * total_step] - self.histogram_sum_data_np = np.concatenate((self.histogram_sum_data_np, np.expand_dims(self.cur_step_histogram_data.cpu(), 1)), axis=1) + self.histogram_sum_data_np = np.concatenate((self.histogram_sum_data_np, + np.expand_dims(self.cur_step_histogram_data.cpu(), 1)), + axis=1) fig, ax = plt.subplots() cax = ax.matshow(self.histogram_sum_data_np, cmap='hot', aspect='auto') fig.colorbar(cax) - plt.yticks(ticks=range(self.histogram_bins_num), labels=[f'{self.histogram_edges[i]:.2f}' for i in range(self.histogram_bins_num)]) + plt.yticks(ticks=range(self.histogram_bins_num), + labels=[f'{self.histogram_edges[i]:.2f}' for i in range(self.histogram_bins_num)]) ax.set_xlabel('Step') ax.set_ylabel('Value Range') plt.title(f'Total Step: {step}') diff --git a/debug/accuracy_tools/kj600/pyproject.toml b/debug/accuracy_tools/msmonitor/pyproject.toml similarity index 94% rename from debug/accuracy_tools/kj600/pyproject.toml rename to debug/accuracy_tools/msmonitor/pyproject.toml index 5df968563345dd07ed477ec73b967b63c6e812a6..e9f899d90b50963e2e0b0afe563d16c490fe8773 100644 --- a/debug/accuracy_tools/kj600/pyproject.toml +++ b/debug/accuracy_tools/msmonitor/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] -name = "kj600" +name = "msmonitor" version = "0.0.1" dependencies = [ "torch", diff --git "a/debug/accuracy_tools/kj600/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" "b/debug/accuracy_tools/msmonitor/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" similarity index 100% rename from "debug/accuracy_tools/kj600/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" rename to "debug/accuracy_tools/msmonitor/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md"