From c77fd1099a8cd58f9e6d03c751450295eeb943f0 Mon Sep 17 00:00:00 2001 From: qijie Date: Wed, 8 May 2024 06:56:52 +0000 Subject: [PATCH 1/6] support g/m direction compare --- .../accuracy_tools/kj600/kj600/module_hook.py | 21 ++++++++++++++++--- .../kj600/kj600/optimizer_collect.py | 9 +++++--- .../kj600/kj600/unittest/config_1.json | 4 +++- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 76c47a7ed..eb0f65f1c 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -48,7 +48,9 @@ class OptimizerContext: def __init__(self) -> None: self.step = 0 self.param_gnorm = defaultdict(float) + self.param_gsign = defaultdict(int) self.param_exp_avg_norm = defaultdict(float) + self.param_exp_avg_sign = defaultdict(int) self.param_exp_avg_sq_norm = defaultdict(float) self.param_effective_rank = defaultdict(float) self.param_adam_update = defaultdict() @@ -69,6 +71,7 @@ class TrainerMon: self.config = get_config(config_file_path) self.module_rank_list = [int(rank) for rank in self.config.get("module_ranks", "").split(',') if rank.strip()] self.ur_distribution = self.config.get('ur_distribution', False) + self.mg_direction = self.config.get('mg_direction', False) self.optimizer_hooked = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') @@ -137,7 +140,7 @@ class TrainerMon: context.verified = True if not context.ignore_in: cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col] - cared_input_grad_cal_result = square_sum(cared_input_grad) + cared_input_grad_cal_result = square_sum(cared_input_grad) if cared_input_grad is not None else torch.tensor(0.) 
else: cared_input_grad_cal_result = None cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col] @@ -191,14 +194,26 @@ class TrainerMon: # in DDP by default use params_have_main_grad def optimizer_pre_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] + rank = dist.get_rank() if dist.is_initialized() else None + + context.param_exp_avg_norm, context.param_exp_avg_sign, context.param_exp_avg_sq_norm, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv( + optimizer, self.param2name, self.update_heatmap_visualizer, self.ratio_heatmap_visualizer, self.ur_distribution, self.mg_direction) + for param, name in self.param2name.items(): grad_for_norm = param.main_grad if self.params_have_main_grad else param.grad context.param_gnorm[name] = grad_for_norm.detach().norm() if "params_effrank" in self.config and name in self.config["params_effrank"]: context.param_effective_rank[name] = eff_rank(param.detach()) - context.param_exp_avg_norm, context.param_exp_avg_sq_norm, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv( - optimizer, self.param2name, self.update_heatmap_visualizer, self.ratio_heatmap_visualizer, self.ur_distribution) + if self.mg_direction: + if context.step == 0: + self.summary_writer.add_scalar(get_summary_writer_tag_name(name, 'adam_mg_direction', rank), 1, context.step) + continue + g_sign = grad_for_norm.detach().sign() + m_sign = context.param_exp_avg_sign[name] + same_direction_ratio = ((m_sign * g_sign).sum().item()/m_sign.numel() + 1)/2 + self.summary_writer.add_scalar(get_summary_writer_tag_name(name, 'adam_mg_direction', rank), same_direction_ratio, context.step) + return def optimizer_post_step_hook(optimizer, args, kwargs): diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py index 962881497..44f478416 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py @@ -25,7 +25,7 @@ class MixPrecsionOptimizerMon: # parameter tensors we want to monitor and their names are in params2name_dict # base_optimizer is pytorch optimizer, wrapped_optimizer is a normal object with base_optimizer - def fetch_mv(self, torch_opt, params2name, update_heatmap_visualizer, ratio_heatmap_visualizer, ur_distribution): + def fetch_mv(self, torch_opt, params2name, update_heatmap_visualizer, ratio_heatmap_visualizer, ur_distribution, mg_direction): mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer if not self.fp16_to_fp32_param and mix_prec_opt is not None: @@ -34,6 +34,7 @@ class MixPrecsionOptimizerMon: self.fp16_to_fp32_param[fp16_param] = fp32_param exp_avg_norm_dict = defaultdict(float) + exp_avg_sign_dict = defaultdict(int) exp_avg_sq_norm_dict = defaultdict(float) update_dict = defaultdict() ratio_dict = defaultdict() @@ -49,10 +50,12 @@ class MixPrecsionOptimizerMon: exp_avg_sq_norm = exp_avg_sq.detach().norm() exp_avg_norm_dict[name] = exp_avg_norm exp_avg_sq_norm_dict[name] = exp_avg_sq_norm + if mg_direction: + exp_avg_sign_dict[name] = exp_avg.detach().sign() if ur_distribution: update_dict[name] = exp_avg / (torch.sqrt(exp_avg_sq) + torch_opt.defaults['eps']) ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) update_heatmap_visualizer[name].pre_cal(update_dict[name]) ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) - - return exp_avg_norm_dict, exp_avg_sq_norm_dict, 
update_dict, ratio_dict + + return exp_avg_norm_dict, exp_avg_sign_dict, exp_avg_sq_norm_dict, update_dict, ratio_dict diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_1.json b/debug/accuracy_tools/kj600/kj600/unittest/config_1.json index fc6196fc1..a3b10f731 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/config_1.json +++ b/debug/accuracy_tools/kj600/kj600/unittest/config_1.json @@ -2,5 +2,7 @@ "targets": { "fc": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"}, "relu": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"} - } + }, + "ur_distribution": true, + "mg_direction": true } \ No newline at end of file -- Gitee From 54fbfc907e26a4c7a874deaee152a62aa974ad96 Mon Sep 17 00:00:00 2001 From: qijie Date: Wed, 8 May 2024 07:52:44 +0000 Subject: [PATCH 2/6] update tb write position --- .../accuracy_tools/kj600/kj600/module_hook.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index eb0f65f1c..17e04a3e5 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -48,9 +48,9 @@ class OptimizerContext: def __init__(self) -> None: self.step = 0 self.param_gnorm = defaultdict(float) - self.param_gsign = defaultdict(int) self.param_exp_avg_norm = defaultdict(float) self.param_exp_avg_sign = defaultdict(int) + self.param_mg_direction = defaultdict(float) self.param_exp_avg_sq_norm = defaultdict(float) self.param_effective_rank = defaultdict(float) self.param_adam_update = defaultdict() @@ -209,11 +209,14 @@ class TrainerMon: if context.step == 0: self.summary_writer.add_scalar(get_summary_writer_tag_name(name, 'adam_mg_direction', rank), 1, context.step) continue - g_sign = grad_for_norm.detach().sign() - m_sign = context.param_exp_avg_sign[name] - same_direction_ratio = ((m_sign * g_sign).sum().item()/m_sign.numel() + 1)/2 - self.summary_writer.add_scalar(get_summary_writer_tag_name(name, 'adam_mg_direction', rank), same_direction_ratio, context.step) - + if name in context.param_exp_avg_sign: + g_sign = grad_for_norm.detach().sign() + m_sign = context.param_exp_avg_sign.pop(name) + same_direction_ratio = ((m_sign * g_sign).sum().item()/m_sign.numel() + 1)/2 + else: + same_direction_ratio = 1 + context.param_mg_direction[name] = same_direction_ratio + # return def optimizer_post_step_hook(optimizer, args, kwargs): @@ -251,6 +254,9 @@ class TrainerMon: self.update_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_update', rank), context.step, self.summary_writer) for param_name, _ in context.param_adam_ratio.items(): self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer) + if self.mg_direction: + for param_name, mg_direction in context.param_gm_direction.items(): + self.summary_writer.add_scalar(get_summary_writer_tag_name(param_name, 'adam_mg_direction', rank), mg_direction.item(), context.step) context.step += 1 return -- Gitee From 9f29d07f9385332816c11e100335707281beac02 Mon Sep 17 00:00:00 2001 From: qijie Date: Wed, 8 May 2024 07:58:23 +0000 Subject: [PATCH 3/6] fix typo --- debug/accuracy_tools/kj600/kj600/module_hook.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py 
b/debug/accuracy_tools/kj600/kj600/module_hook.py index 17e04a3e5..d9b8c3562 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -206,9 +206,6 @@ class TrainerMon: context.param_effective_rank[name] = eff_rank(param.detach()) if self.mg_direction: - if context.step == 0: - self.summary_writer.add_scalar(get_summary_writer_tag_name(name, 'adam_mg_direction', rank), 1, context.step) - continue if name in context.param_exp_avg_sign: g_sign = grad_for_norm.detach().sign() m_sign = context.param_exp_avg_sign.pop(name) @@ -255,8 +252,8 @@ class TrainerMon: for param_name, _ in context.param_adam_ratio.items(): self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer) if self.mg_direction: - for param_name, mg_direction in context.param_gm_direction.items(): - self.summary_writer.add_scalar(get_summary_writer_tag_name(param_name, 'adam_mg_direction', rank), mg_direction.item(), context.step) + for param_name, mg_direction in context.param_mg_direction.items(): + self.summary_writer.add_scalar(get_summary_writer_tag_name(param_name, 'adam_mg_direction', rank), mg_direction, context.step) context.step += 1 return -- Gitee From 3215f0846edaee7feb885e0634ed31543ca8161b Mon Sep 17 00:00:00 2001 From: qijie Date: Wed, 8 May 2024 08:09:15 +0000 Subject: [PATCH 4/6] update README --- debug/accuracy_tools/kj600/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index d03b9606e..05fcb4a21 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -64,6 +64,8 @@ pip install -e . "ur_distribution": 可选字段,若为true则会统计adam优化器的update和ratio的数值分布,并展示在heatmap里,默认为false。 +"mg_direction": 可选字段,若为true则会统计adam优化器的动量与当前梯度方向一致的参数比例。 + 下面给出transformer架构模型中常见的module的前向计算的输入输出和反向计算输入张量的梯度和输出张量的梯度格式,以供参考: | module | input | output | input_grad | output_grad | -- Gitee From 3e634c334b0674706ca0753bb30781f025c6e1b9 Mon Sep 17 00:00:00 2001 From: qijie Date: Wed, 8 May 2024 08:17:40 +0000 Subject: [PATCH 5/6] clean code --- debug/accuracy_tools/kj600/kj600/module_hook.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index d9b8c3562..881351bc8 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -194,7 +194,6 @@ class TrainerMon: # in DDP by default use params_have_main_grad def optimizer_pre_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] - rank = dist.get_rank() if dist.is_initialized() else None context.param_exp_avg_norm, context.param_exp_avg_sign, context.param_exp_avg_sq_norm, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv( optimizer, self.param2name, self.update_heatmap_visualizer, self.ratio_heatmap_visualizer, self.ur_distribution, self.mg_direction) @@ -213,7 +212,7 @@ class TrainerMon: else: same_direction_ratio = 1 context.param_mg_direction[name] = same_direction_ratio - # + return def optimizer_post_step_hook(optimizer, args, kwargs): -- Gitee From 97ba5006b0c7c25b1e54b995e853b5479da47e11 Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 13 May 2024 12:28:57 +0000 Subject: [PATCH 6/6] add annotation --- .../accuracy_tools/kj600/kj600/module_hook.py | 22 +++++++++---------- 1 file 
changed, 11 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 881351bc8..233b000f8 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -47,14 +47,14 @@ class ModuleHookContext: class OptimizerContext: def __init__(self) -> None: self.step = 0 - self.param_gnorm = defaultdict(float) - self.param_exp_avg_norm = defaultdict(float) - self.param_exp_avg_sign = defaultdict(int) - self.param_mg_direction = defaultdict(float) - self.param_exp_avg_sq_norm = defaultdict(float) - self.param_effective_rank = defaultdict(float) - self.param_adam_update = defaultdict() - self.param_adam_ratio = defaultdict() + self.param_gnorm = defaultdict(float) # norm of grad + self.param_exp_avg_norm = defaultdict(float) # norm of the exponential moving average of the gradient (m_{t-1}) + self.param_exp_avg_sign = defaultdict(int) # sign of the exponential moving average of the gradient (m_{t-1}) + self.param_mg_direction = defaultdict(float) # ratio of elements whose sign matches between g_{t} and m_{t-1} + self.param_exp_avg_sq_norm = defaultdict(float) # norm of the exponential moving average of the squared gradient (v_{t-1}) + self.param_effective_rank = defaultdict(float) # effective rank of the parameter + self.param_adam_update = defaultdict() # distribution of update (m_t/(v_t**0.5+eps)) + self.param_adam_ratio = defaultdict() # distribution of ratio (m_t/v_t**0.5) class TrainerMon: @@ -199,14 +199,14 @@ class TrainerMon: optimizer, self.param2name, self.update_heatmap_visualizer, self.ratio_heatmap_visualizer, self.ur_distribution, self.mg_direction) for param, name in self.param2name.items(): - grad_for_norm = param.main_grad if self.params_have_main_grad else param.grad - context.param_gnorm[name] = grad_for_norm.detach().norm() + grad = param.main_grad if self.params_have_main_grad else param.grad + context.param_gnorm[name] = grad.detach().norm() if "params_effrank" in self.config and name in self.config["params_effrank"]: context.param_effective_rank[name] = eff_rank(param.detach()) if self.mg_direction: if name in context.param_exp_avg_sign: - g_sign = grad_for_norm.detach().sign() + g_sign = grad.detach().sign() m_sign = context.param_exp_avg_sign.pop(name) same_direction_ratio = ((m_sign * g_sign).sum().item()/m_sign.numel() + 1)/2 else: -- Gitee
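
For reference, a minimal runnable sketch of the mg_direction statistic these patches add: for each parameter it reports the fraction of elements whose current gradient g_t has the same sign as the Adam first-moment estimate m_{t-1}, using the same ((m_sign * g_sign).mean + 1) / 2 mapping as optimizer_pre_step_hook. The helper name mg_direction_ratio and the random tensors below are illustrative stand-ins, not part of kj600.

import torch

def mg_direction_ratio(grad: torch.Tensor, exp_avg: torch.Tensor) -> float:
    """Share of elements where sign(g_t) matches sign(m_{t-1}), in [0, 1]."""
    g_sign = grad.detach().sign()
    m_sign = exp_avg.detach().sign()
    # (m_sign * g_sign) is +1 where the directions agree and -1 where they differ,
    # so its mean lies in [-1, 1]; shifting by 1 and halving maps it into [0, 1].
    return ((m_sign * g_sign).sum().item() / m_sign.numel() + 1) / 2

if __name__ == "__main__":
    torch.manual_seed(0)
    grad = torch.randn(4, 8)     # stand-in for param.grad / param.main_grad
    exp_avg = torch.randn(4, 8)  # stand-in for the optimizer's exp_avg state
    print(f"same-direction ratio: {mg_direction_ratio(grad, exp_avg):.3f}")

Values near 1.0 mean the momentum and the current gradient point the same way for most elements, while values around 0.5 indicate the two are largely uncorrelated; the hook logs this per parameter under the adam_mg_direction tag.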