From c2eddf12216fa780a9e63fc0fa5a1028af1a977a Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 24 Jul 2024 04:48:55 +0000 Subject: [PATCH 01/94] update readme --- debug/accuracy_tools/kj600/README.md | 47 ++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index bd97acf6dc9..6acf189bbae 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -34,7 +34,50 @@ cd kj600 pip install . ``` -# 快速上手 +## 快速上手 +### 梯度监控 +模型训练状态的异常通常会反映在loss和梯度上,通过对模型各个模块梯度的监控,可以帮助快速定位异常的第一现场。 + + +1. 配置tensorboard写入的目录 +监控结果写入tensorboard的event文件中,设置输出路径(默认为`kj600_output`) + +```bash +export KJ600_OUTPUT_DIR=/xxx/output_dir +tensorboard --logdir=$KJ600_OUTPUT_DIR +``` + +2. 在训练脚本中使能工具 + +``` +from kj600.module_hook import TrainerMon + +model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + model_provider, model_type) +# 模型初始化后插入工具代码 +hooker = TrainerMon("./monitor_config.json", params_have_main_grad=True) +hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size) +``` + +3. 在json文件中配置工具 +``` +{ + "targets": { + "module.language_model.encoder.layers.0": {} + }, + "print_struct": false, # 若不了解模型结构,可以打开print_struct打印模型结构 + "module_ranks": [0,1,2,3], # 需要监控的rank + "wg_distribution": true, + "alert": { + "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}] + }, + "ops": ["norm"], + "eps": 1e-8 +} +``` + + +## 详细配置 下面以Ascend/ModelLink训练框架为例,给出kj600工具的使用方法。 @@ -79,7 +122,7 @@ pip install . |"mv_distribution"| 可选 | 若为true则会监控指定模块中的参数的优化器状态, 默认为false。需要在TrainerMon构造函数正确指定opt_ty. 目前只支持megatron的混合精度优化器以及megatron的分布式优化器。 Deepspeed的分布式优化器实现暂不支持。 | |"wg_distribution"| 可选 | 若为true则会监控指定模块的参数梯度, 默认为false。 | |"alert"| 必选 | · "rules": 指定自动报警的异常检测机制及其相应的阈值。目前实现的异常检测是AnomalyTurbulence。 如果统计标量超出历史均值的指定浮动范围(threshold指定, 0.5意味着上浮或者下浮50%)则在控制台打印报警信息。
· "inform": 自动报警需要的配置,若想关闭自动报警删掉inform的配置即可。其中"recipient"指定自动报警的通知方式,可选值为"database"或"email",默认为"database"。
- 若"recipient"为"database",则需要指定"connection_str"字段,即数据库的连接URL,默认为{"recipient":"database", "connection_str": "mysql+pymysql://username:password@host:port/database"},若有特殊字符需要转义。
- 若"recipient"为"email",则需要指定"send_email_address"-发送方邮箱地址,"receive_email_address"-接收方邮箱地址,"send_email_username"-发送方邮箱用户名,"send_email_password"-发送方邮箱密码,"smtp_server"-发送方邮箱对应的SMTP服务器,"smtp_port"-发送方邮箱对应的SMTP端口号。默认为:
{"recipient":"email", send_email_address": "sender@huawei.com", "receive_email_address": "receiver@huawei.com", "send_email_username": "username", "send_email_password": "******", "smtp_server": "smtpscn.huawei.com", "smtp_port": "587"}| -|"cc_distribution"| 可选 | 其中“enable”字段控制开关;需要监控通信算子时,务必尽量早地实例化`TrainerMon`, 因为监控通过劫持原始func后挂hook实现,部分加速库初始化时会保存原始function,避免监控失效。“cc_codeline”字段指定监控的代码行,如:`train.py\\[23\\]`,默认为空列表,不特别指定;"cc_pre_hook"字段控制是否监控通信前的数据; "cc_log_only"为true时,仅记录调用到的算子及其调用栈, 不监控通信的输入输出| +|"cc_distribution"| 可选 | 其中"enable"字段控制通信监控模块的开关;需要监控通信算子时,务必尽量早地实例化`TrainerMon`, 因为监控通过劫持原始func后挂hook实现,部分加速库初始化时会保存原始function,避免监控失效。"cc_codeline"字段指定监控的代码行,如:`train.py\\[23\\]`,默认为空列表,不特别指定;"cc_pre_hook"字段控制是否监控通信前的数据; 模块会在第二个optimize.step之前打印通信日志,包括通信api的调用栈、输入dtype、通信group。 "cc_log_only"为true时,仅打印日志,不监控通信的输入输出,并在打印后中断训练。可以根据通信日志设置"cc_codeline",规避与训练过程不相关的通信,比如一些时间、metrics的同步。| |"ops"| 可选 |与ur_distribution、xy_distribution、mv_distribution、wg_distribution、mg_direction、cc_distribution配合,监控所选张量的min、max、norm、zeros值。其中,zeros代表监控所选张量的元素小于eps的比例,id代表监控所选的非张量本身,默认为[]。| |"eps"| 可选 |若ops里包含"zeros"则需要配置,默认为1e-8。| -- Gitee From c28483f512b857abdd789cf1f3e8322448a0b5c7 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 24 Jul 2024 04:57:57 +0000 Subject: [PATCH 02/94] monitor grad --- .../accuracy_tools/kj600/kj600/module_hook.py | 79 ++++++++++++++++++- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 8043c5671c4..c2a38dc0433 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -2,6 +2,7 @@ import os import uuid import json from collections import defaultdict +from functools import partial from datetime import datetime import torch import torch.distributed as dist @@ -16,6 +17,11 @@ from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summ from kj600.distributed.wrap_distributed 
import api_register, create_hooks, op_aggregate from kj600.utils import print_warn_log, print_info_log, get_param_struct +if torch.cuda.is_available(): + DEVICE = 'cuda' +elif hasattr(torch, 'npu') and torch.npu.is_available(): + DEVICE = 'npu' + class ModuleHookContext: def __init__(self, module_name) -> None: @@ -71,6 +77,19 @@ class CommunicationContext: def aggregate(self): self.data = self._agg(self.data) + +class GradContext: + def __init__(self) -> None: + self.pre = [] + self.post = [] + self.zeros_like = None + + def reset(self): + self.pre.clear() + self.post.clear() + self.grad_acc.fill_(0.) + + class TrainerMon: tensor_metrics = TensorMetrics() @@ -81,6 +100,7 @@ class TrainerMon: self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext) self.optimizer_context = defaultdict(OptimizerContext) self.cc_context = defaultdict(CommunicationContext) + self.grad_context = defaultdict(GradContext) self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) self.module_rank_list = self.config.get("module_ranks", []) @@ -198,6 +218,11 @@ class TrainerMon: self.hook_optimizer() return + def monitor_gnorm_with_ad(self, model, grad_acc_steps): + self._hook_weights(model) + self.hook_optimizer() + self.micro_batch_number = grad_acc_steps + def build_tbtag_tensor_map(self, module_name, tag, tensor): metrics = {} rank = dist.get_rank() if dist.is_initialized() else None @@ -246,6 +271,18 @@ class TrainerMon: write_metrics_tensorboard(metric_name, self.summary_writer, bwd_context.actvgrad, step) bwd_context.actvgrad.clear() + def write_grad_tb(self, step): + if not self.wg_distribution: + return + + for name in self.param2name.values(): + context = self.grad_context[name] + + write_metrics_tensorboard('norm', self.summary_writer, context.pre, step) + write_metrics_tensorboard('norm', self.summary_writer, context.post, step) + + context.reset() + def hook_optimizer(self): # in DDP by default use params_have_main_grad def 
optimizer_pre_step_hook(optimizer, args, kwargs): @@ -264,6 +301,7 @@ class TrainerMon: context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) + rank = dist.get_rank() if dist.is_initialized() else None for param, name in self.param2name.items(): if "params_effrank" in self.config and name in self.config["params_effrank"]: context.param_effective_rank[name] = eff_rank(param.detach()) @@ -272,8 +310,19 @@ class TrainerMon: print_warn_log(f"grad is None: {name}, maybe something wrong happened.") continue if self.wg_distribution: - context.param_weight_grad[name] = grad - if self.mg_direction: + metric_dict = {} + key = get_summary_writer_tag_name(name, 'post_grad', rank) + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, {key: grad}, self.eps) + self.grad_context[name].post.append(metric_dict) + + metric_dict = {} + key = get_summary_writer_tag_name(name, 'pre_grad', rank) + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, {key: self.grad_context[name].grad_acc}, self.eps) + self.grad_context[name].pre.append(metric_dict) + + if self.mg_direction: if context.step == 0: same_direction_ratio = torch.tensor(1.) 
else: @@ -281,8 +330,6 @@ class TrainerMon: context.param_mg_direction[name] = same_direction_ratio tbtag_tensor_map = {} - if self.wg_distribution: - tbtag_tensor_map.update(self.generate_param_metrics('weight_grad', context.param_weight_grad)) if self.mv_distribution: tbtag_tensor_map.update(self.generate_param_metrics('exp_avg', context.param_exp_avg)) tbtag_tensor_map.update(self.generate_param_metrics('exp_avg_sq', context.param_exp_avg_sq)) @@ -308,6 +355,7 @@ class TrainerMon: rank = dist.get_rank() if dist.is_initialized() else None self.write_xy_tb(context.step) + self.write_grad_tb(context.step) self.write_adhoc_check(context.step) if self.ur_distribution: @@ -432,3 +480,26 @@ class TrainerMon: print_rank_0(f"> {name} is monitored successfully") hooked_count += 1 return hooked_count + + def _hook_weights(self, model): + self.wg_distribution = True + rank = dist.get_rank() if dist.is_initialized() else None + + def param_hook(grad, context): + with torch.no_grad(): + context.grad_acc += grad + + if self.print_struct: + self.module_struct = { + module_name: 1. 
for module_name, module in model.named_modules()} + return + + for name, param in model.named_parameters(): + for target in self.config['targets'].keys(): + context = self.grad_context[name] + if name.startswith(target) and param.requires_grad: + self._smallest_rank_print(f'>> monitoring: {name}') + self.param2name[param] = name + param.register_hook( + partial(param_hook, context=context)) + context.grad_acc = torch.zeros_like(param).to(DEVICE) -- Gitee From 000059bbca5813139b6b7091d48c1f0e3a63fd38 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 24 Jul 2024 05:20:47 +0000 Subject: [PATCH 03/94] adapt device --- debug/accuracy_tools/kj600/kj600/module_hook.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index c2a38dc0433..bb92b292a9d 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -17,11 +17,12 @@ from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summ from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate from kj600.utils import print_warn_log, print_info_log, get_param_struct -if torch.cuda.is_available(): - DEVICE = 'cuda' -elif hasattr(torch, 'npu') and torch.npu.is_available(): - DEVICE = 'npu' +try: + import torch_npu + DEVICE = 'npu' +except: + DEVICE= 'cuda' class ModuleHookContext: def __init__(self, module_name) -> None: @@ -82,7 +83,7 @@ class GradContext: def __init__(self) -> None: self.pre = [] self.post = [] - self.zeros_like = None + self.grad_acc = None def reset(self): self.pre.clear() @@ -500,6 +501,5 @@ class TrainerMon: if name.startswith(target) and param.requires_grad: self._smallest_rank_print(f'>> monitoring: {name}') self.param2name[param] = name - param.register_hook( - partial(param_hook, context=context)) + param.register_hook(partial(param_hook, context=context)) 
context.grad_acc = torch.zeros_like(param).to(DEVICE) -- Gitee From 51daebd8c943ac93e156cc709f671b82118b581e Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 24 Jul 2024 05:45:05 +0000 Subject: [PATCH 04/94] specify exception --- debug/accuracy_tools/kj600/kj600/module_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index bb92b292a9d..ceda08e52ec 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -21,7 +21,7 @@ from kj600.utils import print_warn_log, print_info_log, get_param_struct try: import torch_npu DEVICE = 'npu' -except: +except ImportError: DEVICE= 'cuda' class ModuleHookContext: -- Gitee From 6c0451b94efe530e255bf4a31fdafd716f95e382 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 24 Jul 2024 06:41:01 +0000 Subject: [PATCH 05/94] adapt ops for grad monitor --- debug/accuracy_tools/kj600/kj600/module_hook.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index ceda08e52ec..d13736aeb20 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -278,9 +278,9 @@ class TrainerMon: for name in self.param2name.values(): context = self.grad_context[name] - - write_metrics_tensorboard('norm', self.summary_writer, context.pre, step) - write_metrics_tensorboard('norm', self.summary_writer, context.post, step) + for metric_name in self.ops: + write_metrics_tensorboard(metric_name, self.summary_writer, context.pre, step) + write_metrics_tensorboard(metric_name, self.summary_writer, context.post, step) context.reset() -- Gitee From 6477f296135bd8d1817b5a1ceaeaa86c551c965e Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 26 Jul 2024 07:25:59 +0000 Subject: [PATCH 06/94] support vpp --- 
.../accuracy_tools/kj600/kj600/module_hook.py | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index d13736aeb20..6d1b25023e5 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -147,6 +147,7 @@ class TrainerMon: anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None self.optimizer_hooked = False + self.vpp = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] @@ -490,16 +491,33 @@ class TrainerMon: with torch.no_grad(): context.grad_acc += grad + def register_hooks(model_chunk, prefix=''): + for param_name, param in model_chunk.named_parameters(): + name = prefix + param_name + for target in self.config['targets'].keys(): + context = self.grad_context[name] + if param_name.startswith(target) and param.requires_grad: + self._smallest_rank_print(f'>> monitoring: {name}') + self.param2name[param] = name + param.register_hook(partial(param_hook, context=context)) + context.grad_acc = torch.zeros_like(param).to(DEVICE) + + if self.print_struct: self.module_struct = { module_name: 1. 
for module_name, module in model.named_modules()} return - for name, param in model.named_parameters(): - for target in self.config['targets'].keys(): - context = self.grad_context[name] - if name.startswith(target) and param.requires_grad: - self._smallest_rank_print(f'>> monitoring: {name}') - self.param2name[param] = name - param.register_hook(partial(param_hook, context=context)) - context.grad_acc = torch.zeros_like(param).to(DEVICE) + if isinstance(model, list): + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') + + for vpp_stage, model_chunk in enumerate(model): + prefix = f'{vpp_stage}_' if self.vpp else '' + register_hooks(model_chunk, prefix=prefix) + + else: + register_hooks(model) + + -- Gitee From 9ad80a6b9007d212071bf9d54836c4e7002eac8e Mon Sep 17 00:00:00 2001 From: wuyulong11 <2284273586@qq.com> Date: Wed, 31 Jul 2024 17:15:17 +0800 Subject: [PATCH 07/94] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=9C=AA=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E7=9A=84libkineto=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.github/workflows/libkineto_ci.yml | 56 -- .../workflows/tb_plugin_build_pip_package.yml | 19 - .../.github/workflows/tb_plugin_ci.yml | 57 -- plugins/tensorboard-plugins/.gitignore | 3 - plugins/tensorboard-plugins/.gitmodules | 6 - .../tensorboard-plugins/CODE_OF_CONDUCT.md | 77 -- plugins/tensorboard-plugins/CONTRIBUTING.md | 34 - plugins/tensorboard-plugins/LICENSE | 33 - plugins/tensorboard-plugins/README.md | 38 - .../libkineto/CMakeLists.txt | 198 ----- .../tensorboard-plugins/libkineto/README.md | 65 -- .../libkineto/include/AbstractConfig.h | 113 --- .../include/ActivityProfilerInterface.h | 91 -- .../include/ActivityTraceInterface.h | 21 - .../libkineto/include/ActivityType.h | 34 - .../libkineto/include/ClientInterface.h | 16 - .../libkineto/include/Config.h | 433 --------- .../libkineto/include/GenericTraceActivity.h | 125 --- 
.../libkineto/include/IActivityProfiler.h | 104 --- .../libkineto/include/ILoggerObserver.h | 50 -- .../libkineto/include/ITraceActivity.h | 53 -- .../libkineto/include/ThreadUtil.h | 22 - .../libkineto/include/TraceSpan.h | 36 - .../libkineto/include/libkineto.h | 138 --- .../libkineto/include/time_since_epoch.h | 16 - .../libkineto/libkineto_defs.bzl | 77 -- .../sample_programs/kineto_playground.cpp | 38 - .../sample_programs/kineto_playground.cu | 60 -- .../sample_programs/kineto_playground.cuh | 18 - .../libkineto/src/AbstractConfig.cpp | 188 ---- .../libkineto/src/ActivityBuffers.h | 29 - .../libkineto/src/ActivityLoggerFactory.h | 60 -- .../src/ActivityProfilerController.cpp | 246 ----- .../src/ActivityProfilerController.h | 84 -- .../libkineto/src/ActivityProfilerProxy.cpp | 119 --- .../libkineto/src/ActivityProfilerProxy.h | 73 -- .../libkineto/src/ActivityTrace.h | 45 - .../libkineto/src/ActivityType.cpp | 58 -- .../libkineto/src/Config.cpp | 473 ---------- .../libkineto/src/ConfigLoader.cpp | 300 ------- .../libkineto/src/ConfigLoader.h | 147 --- .../libkineto/src/CudaDeviceProperties.cpp | 130 --- .../libkineto/src/CudaDeviceProperties.h | 31 - .../libkineto/src/CuptiActivity.h | 114 --- .../libkineto/src/CuptiActivity.tpp | 111 --- .../libkineto/src/CuptiActivityApi.cpp | 343 ------- .../libkineto/src/CuptiActivityApi.h | 100 --- .../libkineto/src/CuptiActivityBuffer.h | 51 -- .../libkineto/src/CuptiActivityPlatform.cpp | 31 - .../libkineto/src/CuptiActivityPlatform.h | 12 - .../libkineto/src/CuptiActivityProfiler.cpp | 841 ------------------ .../libkineto/src/CuptiActivityProfiler.h | 364 -------- .../libkineto/src/CuptiCallbackApi.cpp | 260 ------ .../libkineto/src/CuptiCallbackApi.h | 130 --- .../libkineto/src/CuptiCallbackApiMock.h | 32 - .../libkineto/src/CuptiEventApi.cpp | 112 --- .../libkineto/src/CuptiEventApi.h | 49 - .../libkineto/src/CuptiMetricApi.cpp | 107 --- .../libkineto/src/CuptiMetricApi.h | 38 - 
.../libkineto/src/CuptiNvPerfMetric.cpp | 504 ----------- .../libkineto/src/CuptiNvPerfMetric.h | 71 -- .../libkineto/src/CuptiRangeProfilerApi.cpp | 751 ---------------- .../libkineto/src/CuptiRangeProfilerApi.h | 220 ----- .../src/CuptiRangeProfilerConfig.cpp | 68 -- .../libkineto/src/CuptiRangeProfilerConfig.h | 86 -- .../libkineto/src/DaemonConfigLoader.h | 27 - .../libkineto/src/Demangle.cpp | 49 - .../libkineto/src/Demangle.h | 12 - .../libkineto/src/EventProfiler.cpp | 635 ------------- .../libkineto/src/EventProfiler.h | 341 ------- .../libkineto/src/EventProfilerController.cpp | 423 --------- .../libkineto/src/EventProfilerController.h | 63 -- .../libkineto/src/GenericTraceActivity.cpp | 10 - .../libkineto/src/ILoggerObserver.cpp | 54 -- .../libkineto/src/Logger.cpp | 136 --- .../libkineto/src/Logger.h | 244 ----- .../libkineto/src/LoggerCollector.h | 70 -- .../libkineto/src/RoctracerActivityApi.cpp | 569 ------------ .../libkineto/src/RoctracerActivityApi.h | 171 ---- .../libkineto/src/RoctracerActivityBuffer.h | 30 - .../libkineto/src/SampleListener.h | 146 --- .../libkineto/src/ScopeExit.h | 29 - .../libkineto/src/ThreadUtil.cpp | 203 ----- .../libkineto/src/WeakSymbols.cpp | 12 - .../libkineto/src/cupti_call.h | 33 - .../libkineto/src/cupti_strings.cpp | 502 ----------- .../libkineto/src/cupti_strings.h | 14 - .../libkineto/src/init.cpp | 139 --- .../libkineto/src/libkineto_api.cpp | 41 - .../libkineto/src/output_base.h | 104 --- .../libkineto/src/output_csv.cpp | 88 -- .../libkineto/src/output_csv.h | 39 - .../libkineto/src/output_json.cpp | 583 ------------ .../libkineto/src/output_json.h | 91 -- .../libkineto/src/output_membuf.h | 130 --- .../libkineto/test/CMakeLists.txt | 3 - .../libkineto/test/ConfigTest.cpp | 315 ------- .../test/CuptiActivityProfilerTest.cpp | 629 ------------- .../libkineto/test/CuptiCallbackApiTest.cpp | 239 ----- .../libkineto/test/CuptiProfilerApiTest.cu | 353 -------- .../test/CuptiRangeProfilerApiTest.cpp | 113 --- 
.../test/CuptiRangeProfilerConfigTest.cpp | 67 -- .../test/CuptiRangeProfilerTestUtil.h | 96 -- .../libkineto/test/CuptiStringsTest.cpp | 29 - .../libkineto/test/EventProfilerTest.cpp | 578 ------------ .../libkineto/test/LoggerObserverTest.cpp | 96 -- .../test/MockActivitySubProfiler.cpp | 49 - .../libkineto/test/MockActivitySubProfiler.h | 72 -- .../libkineto/test/PidInfoTest.cpp | 27 - 109 files changed, 16063 deletions(-) delete mode 100644 plugins/tensorboard-plugins/.github/workflows/libkineto_ci.yml delete mode 100644 plugins/tensorboard-plugins/.github/workflows/tb_plugin_build_pip_package.yml delete mode 100644 plugins/tensorboard-plugins/.github/workflows/tb_plugin_ci.yml delete mode 100644 plugins/tensorboard-plugins/.gitignore delete mode 100644 plugins/tensorboard-plugins/.gitmodules delete mode 100644 plugins/tensorboard-plugins/CODE_OF_CONDUCT.md delete mode 100644 plugins/tensorboard-plugins/CONTRIBUTING.md delete mode 100644 plugins/tensorboard-plugins/LICENSE delete mode 100644 plugins/tensorboard-plugins/README.md delete mode 100644 plugins/tensorboard-plugins/libkineto/CMakeLists.txt delete mode 100644 plugins/tensorboard-plugins/libkineto/README.md delete mode 100644 plugins/tensorboard-plugins/libkineto/include/AbstractConfig.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/ActivityProfilerInterface.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/ActivityTraceInterface.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/ActivityType.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/ClientInterface.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/Config.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/GenericTraceActivity.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/IActivityProfiler.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/ILoggerObserver.h delete mode 100644 
plugins/tensorboard-plugins/libkineto/include/ITraceActivity.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/ThreadUtil.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/TraceSpan.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/libkineto.h delete mode 100644 plugins/tensorboard-plugins/libkineto/include/time_since_epoch.h delete mode 100644 plugins/tensorboard-plugins/libkineto/libkineto_defs.bzl delete mode 100644 plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cu delete mode 100644 plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cuh delete mode 100644 plugins/tensorboard-plugins/libkineto/src/AbstractConfig.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityBuffers.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityLoggerFactory.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityTrace.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ActivityType.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/Config.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ConfigLoader.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ConfigLoader.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivity.h delete mode 100644 
plugins/tensorboard-plugins/libkineto/src/CuptiActivity.tpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivityBuffer.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApiMock.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/DaemonConfigLoader.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/Demangle.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/Demangle.h delete mode 100644 
plugins/tensorboard-plugins/libkineto/src/EventProfiler.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/EventProfiler.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/EventProfilerController.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/EventProfilerController.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/GenericTraceActivity.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ILoggerObserver.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/Logger.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/Logger.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/LoggerCollector.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/RoctracerActivityBuffer.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/SampleListener.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ScopeExit.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/ThreadUtil.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/WeakSymbols.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/cupti_call.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/cupti_strings.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/cupti_strings.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/init.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/libkineto_api.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/output_base.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/output_csv.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/src/output_csv.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/output_json.cpp delete mode 100644 
plugins/tensorboard-plugins/libkineto/src/output_json.h delete mode 100644 plugins/tensorboard-plugins/libkineto/src/output_membuf.h delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CMakeLists.txt delete mode 100644 plugins/tensorboard-plugins/libkineto/test/ConfigTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CuptiActivityProfilerTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CuptiCallbackApiTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CuptiProfilerApiTest.cu delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerApiTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerConfigTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerTestUtil.h delete mode 100644 plugins/tensorboard-plugins/libkineto/test/CuptiStringsTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/EventProfilerTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/LoggerObserverTest.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.cpp delete mode 100644 plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.h delete mode 100644 plugins/tensorboard-plugins/libkineto/test/PidInfoTest.cpp diff --git a/plugins/tensorboard-plugins/.github/workflows/libkineto_ci.yml b/plugins/tensorboard-plugins/.github/workflows/libkineto_ci.yml deleted file mode 100644 index 3133d6400fb..00000000000 --- a/plugins/tensorboard-plugins/.github/workflows/libkineto_ci.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: LIBKINETOCI - -on: - push: - branches: - - main - pull_request: - branches: - - main - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@v2 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git 
submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 - - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - # TODO: Figure out how to install cupti headers T84637671 - - name: Build static lib - run: | - set -e - mkdir build_static - cd build_static - cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/ - make -j - - - name: Build shared lib - run: | - set -e - mkdir build_shared - cd build_shared - cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/ - make -j diff --git a/plugins/tensorboard-plugins/.github/workflows/tb_plugin_build_pip_package.yml b/plugins/tensorboard-plugins/.github/workflows/tb_plugin_build_pip_package.yml deleted file mode 100644 index 9bdafcc4426..00000000000 --- a/plugins/tensorboard-plugins/.github/workflows/tb_plugin_build_pip_package.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: Build torch-tb-profiler Pip Package - -on: - # TODO: Add an on_release trigger to build on tags - workflow_dispatch: - -jobs: - build-package: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: build pip package - run: | - set -e - cd tb_plugin - python setup.py sdist bdist_wheel - cd dist/ - pip install *.whl - python -c "import torch_tb_profiler;print(torch_tb_profiler.__version__)" diff --git a/plugins/tensorboard-plugins/.github/workflows/tb_plugin_ci.yml b/plugins/tensorboard-plugins/.github/workflows/tb_plugin_ci.yml deleted file mode 100644 index 1b59a7bf90a..00000000000 --- a/plugins/tensorboard-plugins/.github/workflows/tb_plugin_ci.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: 
TB_Plugin_CI - -on: - push: - branches: - - main - - release/** - - plugin/** - - pull_request: - branches: - - main - - release/** - - plugin/** - -jobs: - generate-matrix: - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - id: set-matrix - run: | - echo $GITHUB_BASE_REF - if [ $GITHUB_BASE_REF == "plugin/vnext" ] - then - echo "::set-output name=matrix::{\"python-version\":[3.7, 3.8, 3.9], \"cuda-version\":[\"cpu\"], \"pytorch-version\":[\"nightly\"]}" - else - echo "::set-output name=matrix::{\"python-version\":[3.7, 3.8, 3.9], \"cuda-version\":[\"cpu\"], \"pytorch-version\":[\"nightly\", \"1.11rc\", \"stable\"]}" - fi - - build: - needs: generate-matrix - runs-on: ubuntu-latest - strategy: - matrix: ${{fromJSON(needs.generate-matrix.outputs.matrix)}} - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - architecture: 'x64' - - name: Test - env: - CUDA_VERSION: ${{ matrix.cuda-version }} - PYTORCH_VERSION: ${{ matrix.pytorch-version }} - TORCH_PROFILER_LOG_LEVEL: DEBUG - GRPC_VERBOSITY: DEBUG - GRPC_ENABLE_FORK_SUPPORT: 'False' - run: | - set -e - cd tb_plugin - sh ./ci_scripts/install_env.sh - pip install .[gs] - cd test - pytest diff --git a/plugins/tensorboard-plugins/.gitignore b/plugins/tensorboard-plugins/.gitignore deleted file mode 100644 index ce186381c0b..00000000000 --- a/plugins/tensorboard-plugins/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# ignore common items -.idea -.vscode diff --git a/plugins/tensorboard-plugins/.gitmodules b/plugins/tensorboard-plugins/.gitmodules deleted file mode 100644 index 4660ee8bc9e..00000000000 --- a/plugins/tensorboard-plugins/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "libkineto/third_party/googletest"] - path = libkineto/third_party/googletest - url = https://github.com/google/googletest.git -[submodule "libkineto/third_party/fmt"] - 
path = libkineto/third_party/fmt - url = https://github.com/fmtlib/fmt.git diff --git a/plugins/tensorboard-plugins/CODE_OF_CONDUCT.md b/plugins/tensorboard-plugins/CODE_OF_CONDUCT.md deleted file mode 100644 index a0cbeaab765..00000000000 --- a/plugins/tensorboard-plugins/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,77 +0,0 @@ -# Code of Conduct - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. - -## Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or - advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. 
- -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at . All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. 
- -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq - diff --git a/plugins/tensorboard-plugins/CONTRIBUTING.md b/plugins/tensorboard-plugins/CONTRIBUTING.md deleted file mode 100644 index a2e931bb6f0..00000000000 --- a/plugins/tensorboard-plugins/CONTRIBUTING.md +++ /dev/null @@ -1,34 +0,0 @@ -# Contributing to Kineto -We want to make contributing to this project as easy and transparent as -possible. - -## Code of Conduct -The code of conduct is described in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md). - -## Pull Requests -We actively welcome your pull requests. - -1. Fork the repo and create your branch from `main`. -2. If you've added code that should be tested, add tests. -3. If you've changed APIs, update the documentation. -4. Ensure the test suite passes. -5. Make sure your code lints. -6. If you haven't already, complete the Contributor License Agreement ("CLA"). - -## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook's open source projects. - -Complete your CLA here: - -## Issues -We use GitHub issues to track public bugs. Please ensure your description is -clear and has sufficient instructions to be able to reproduce the issue. - -Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue. - -## License -By contributing to Kineto, you agree that your contributions will be licensed -under the LICENSE file in the root directory of this source tree. 
diff --git a/plugins/tensorboard-plugins/LICENSE b/plugins/tensorboard-plugins/LICENSE deleted file mode 100644 index edb179715b5..00000000000 --- a/plugins/tensorboard-plugins/LICENSE +++ /dev/null @@ -1,33 +0,0 @@ -BSD License - -For Kineto software - -Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. - -All contributions by Microsoft: -Copyright (c) Microsoft Corporation. (The Azure AI Platform team) - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name Facebook nor the names of its contributors may be used to - endorse or promote products derived from this software without specific - prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/plugins/tensorboard-plugins/README.md b/plugins/tensorboard-plugins/README.md deleted file mode 100644 index 3a18f4c6239..00000000000 --- a/plugins/tensorboard-plugins/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Kineto - -Kineto is part of the PyTorch Profiler. - -The Kineto project was started to help enable -- **performance observability and diagnostics** across common ML bottleneck components -- **actionable recommendations** for common issues -- integration of external system-level profiling tools -- integration with popular visualization platforms and analysis pipelines - -A central component is libkineto, a profiling library with special focus on low-overhead GPU timeline tracing. - -The PyTorch Profiler TensorBoard plugin provides powerful and intuitive visualizations of profiling results, as well as actionable recommendations, and is the best way to experience the new PyTorch Profiler. - -## Libkineto -Libkineto is an in-process profiling library integrated with the PyTorch Profiler. Please refer to the [README](libkineto/README.md) file in the `libkineto` folder as well as documentation on the [new PyTorch Profiler API](https://pytorch.org/docs/master/profiler.html). - -## PyTorch TensorBoard Profiler NPU Plugin -The goal of the PyTorch TensorBoard Profiler is to provide a seamless and intuitive end-to-end profiling experience, including straightforward collection from PyTorch and insightful visualizations and recommendations in the TensorBoard UI. -Please refer to the [README](tb_plugin/README.md) file in the `tb_plugin` folder. - -## Future Development Direction: -Some areas we're currently working on: -- Support for tracing distributed workloads -- Trace processing, analysis and recommendation engine -- System-level activities, multiple tracing sources -- Profiling and monitoring daemon for larger scale deployments - -## Releases and Contributing -We will follow the PyTorch release schedule which roughly happens on a 3 month basis. 
- -We appreciate all contributions. If you are planning to contribute back bug-fixes, please do so without any further discussion. - -If you plan to contribute new features, please first open an issue and discuss the feature with us. Sending a PR without discussion might end up resulting in a rejected PR because we might be taking the infrastructure in a different direction than you might be aware of. We expect the architecture to keep evolving. - -## License -Kineto has a BSD-style license, as found in the [LICENSE](LICENSE) file. - diff --git a/plugins/tensorboard-plugins/libkineto/CMakeLists.txt b/plugins/tensorboard-plugins/libkineto/CMakeLists.txt deleted file mode 100644 index 63966de803a..00000000000 --- a/plugins/tensorboard-plugins/libkineto/CMakeLists.txt +++ /dev/null @@ -1,198 +0,0 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) - -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") - -#install libraries into correct locations on all platforms -include(GNUInstallDirs) - -# function to extract filelists from libkineto_defs.bzl file -find_package(PythonInterp) -function(get_filelist name outputvar) - execute_process( - COMMAND "${PYTHON_EXECUTABLE}" -c - "exec(open('libkineto_defs.bzl').read());print(';'.join(${name}))" - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" - OUTPUT_VARIABLE _tempvar) - string(REPLACE "\n" "" _tempvar "${_tempvar}") - set(${outputvar} ${_tempvar} PARENT_SCOPE) -endfunction() - -project(kineto VERSION 0.1 LANGUAGES CXX C) - -set(KINETO_LIBRARY_TYPE "default" CACHE STRING - "Type of library (default, static or shared) to build") -set_property(CACHE KINETO_LIBRARY_TYPE PROPERTY STRINGS default shared) -option(KINETO_BUILD_TESTS "Build kineto unit tests" ON) - -set(LIBKINETO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src") -set(LIBKINETO_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include") -set(LIBKINETO_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -set(LIBKINETO_THIRDPARTY_DIR 
"${CMAKE_CURRENT_SOURCE_DIR}/third_party") -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -#We should default to a Release build -if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "") - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) -endif() - -if (NOT CUDA_SOURCE_DIR) - set(CUDA_SOURCE_DIR "$ENV{CUDA_SOURCE_DIR}") - message(INFO " CUDA_SOURCE_DIR = ${CUDA_SOURCE_DIR}") -endif() - -if (NOT ROCM_SOURCE_DIR) - set(ROCM_SOURCE_DIR "$ENV{ROCM_SOURCE_DIR}") - message(INFO " ROCM_SOURCE_DIR = ${ROCM_SOURCE_DIR}") -endif() - -# Set LIBKINETO_NOCUPTI to explicitly disable CUPTI -# Otherwise, CUPTI is disabled if not found -IF (NOT CUDA_SOURCE_DIR OR NOT CUPTI_INCLUDE_DIR OR NOT CUDA_cupti_LIBRARY) - set(LIBKINETO_NOCUPTI ON CACHE BOOL "" FORCE) -endif() - -IF (NOT ROCM_SOURCE_DIR AND NOT ROCTRACER_INCLUDE_DIR) - set(LIBKINETO_NOROCTRACER ON CACHE BOOL "" FORCE) -endif() - -# Define file lists -if (LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER) - get_filelist("get_libkineto_cpu_only_srcs(with_api=False)" LIBKINETO_SRCS) - message(INFO " CUPTI unavailable or disabled - not building GPU profilers") -elseif(NOT LIBKINETO_NOROCTRACER) - get_filelist("get_libkineto_roctracer_srcs()" LIBKINETO_SRCS) - message(INFO " Building with roctracer") -else() - get_filelist("get_libkineto_cupti_srcs(with_api=False)" LIBKINETO_SRCS) -endif() -get_filelist("get_libkineto_public_headers()" LIBKINETO_PUBLIC_HEADERS) -get_filelist("get_libkineto_api_srcs()" LIBKINETO_API_SRCS) - -add_library(kineto_base OBJECT ${LIBKINETO_SRCS}) -add_library(kineto_api OBJECT ${LIBKINETO_API_SRCS}) - -# Make libraries depend on libkineto_defs.bzl -add_custom_target(libkineto_defs.bzl DEPENDS libkineto_defs.bzl) -add_dependencies(kineto_base libkineto_defs.bzl) - -set_target_properties(kineto_base kineto_api PROPERTIES - CXX_STANDARD 14 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO - CXX_VISIBILITY_PRESET hidden) - -set(KINETO_COMPILE_OPTIONS "-DKINETO_NAMESPACE=libkineto") -list(APPEND 
KINETO_COMPILE_OPTIONS "-DFMT_HEADER_ONLY") -if(NOT MSVC) - list(APPEND KINETO_COMPILE_OPTIONS "-std=c++14") -else() - list(APPEND KINETO_COMPILE_OPTIONS "/std:c++14") - list(APPEND KINETO_COMPILE_OPTIONS "-DWIN32_LEAN_AND_MEAN") - list(APPEND KINETO_COMPILE_OPTIONS "-DNOGDI") -endif() -if (NOT LIBKINETO_NOCUPTI) - list(APPEND KINETO_COMPILE_OPTIONS "-DHAS_CUPTI") -endif() -if (NOT LIBKINETO_NOROCTRACER) - target_compile_options(kineto_base PRIVATE "-DHAS_ROCTRACER") - target_compile_options(kineto_base PRIVATE "-D__HIP_PLATFORM_HCC__") - target_compile_options(kineto_base PRIVATE "-D__HIP_PLATFORM_AMD__") -endif() - -target_compile_options(kineto_base PRIVATE "${KINETO_COMPILE_OPTIONS}") -target_compile_options(kineto_api PRIVATE "${KINETO_COMPILE_OPTIONS}") - -if(NOT TARGET fmt) - if(NOT FMT_SOURCE_DIR) - set(FMT_SOURCE_DIR "${LIBKINETO_THIRDPARTY_DIR}/fmt" - CACHE STRING "fmt source directory from submodules") - endif() - - # Build FMT. - # FMT and some other libraries use BUILD_SHARED_LIBS to control - # the library type. 
- # Save and restore the value after configuring FMT - set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) - set(FMT_LIBRARY_TYPE static CACHE STRING "Set lib type to static") - add_subdirectory("${FMT_SOURCE_DIR}" "${LIBKINETO_BINARY_DIR}/fmt") - set_property(TARGET fmt PROPERTY POSITION_INDEPENDENT_CODE ON) - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) -endif() - -set(FMT_INCLUDE_DIR "${FMT_SOURCE_DIR}/include") -message(STATUS "Kineto: FMT_SOURCE_DIR = ${FMT_SOURCE_DIR}") -message(STATUS "Kineto: FMT_INCLUDE_DIR = ${FMT_INCLUDE_DIR}") -if (NOT CUPTI_INCLUDE_DIR) - set(CUPTI_INCLUDE_DIR "${CUDA_SOURCE_DIR}/extras/CUPTI/include") -endif() -if (NOT CUDA_INCLUDE_DIRS) - set(CUDA_INCLUDE_DIRS "${CUDA_SOURCE_DIR}/include") -endif() -if (NOT ROCTRACER_INCLUDE_DIR) - set(ROCTRACER_INCLUDE_DIR "${ROCM_SOURCE_DIR}/roctracer/include") -endif() -if (NOT ROCM_INCLUDE_DIRS) - set(ROCM_INCLUDE_DIRS "${ROCM_SOURCE_DIR}/include") -endif() - -message(INFO " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}") -message(INFO " ROCTRACER_INCLUDE_DIR = ${ROCTRACER_INCLUDE_DIR}") - -target_include_directories(kineto_base PUBLIC - $ - $ - $ - $ - $ - $ - $) - -target_include_directories(kineto_api PUBLIC - $ - $) - -if(KINETO_LIBRARY_TYPE STREQUAL "default") - add_library(kineto - $ - $) -elseif(KINETO_LIBRARY_TYPE STREQUAL "static") - add_library(kineto STATIC - $ - $) -elseif(KINETO_LIBRARY_TYPE STREQUAL "shared") - add_library(kineto SHARED - $) - set_property(TARGET kineto_base PROPERTY POSITION_INDEPENDENT_CODE ON) - set_target_properties(kineto PROPERTIES - CXX_VISIBILITY_PRESET hidden) -else() - message(FATAL_ERROR "Unsupported library type ${KINETO_LIBRARY_TYPE}") -endif() - -if(NOT LIBKINETO_NOROCTRACER) - find_library(ROCTRACER_LIBRARY NAMES libroctracer64.so HINTS /opt/rocm/roctracer/lib) - target_link_libraries(kineto "${ROCTRACER_LIBRARY}") - find_library(KINETO_HIP_LIBRARY NAMES 
libamdhip64.so HINTS /opt/rocm/lib) - target_link_libraries(kineto "${KINETO_HIP_LIBRARY}") -endif() - -if(NOT LIBKINETO_NOCUPTI) - target_link_libraries(kineto "${CUDA_cupti_LIBRARY}") -endif() -target_link_libraries(kineto $) -add_dependencies(kineto fmt::fmt-header-only) - -install(TARGETS kineto EXPORT kinetoLibraryConfig - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) - -install(FILES ${LIBKINETO_PUBLIC_HEADERS} - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/kineto") - -install(EXPORT kinetoLibraryConfig DESTINATION share/cmake/kineto - FILE kinetoLibraryConfig.cmake) - -if(KINETO_BUILD_TESTS) - add_subdirectory(test) -endif() diff --git a/plugins/tensorboard-plugins/libkineto/README.md b/plugins/tensorboard-plugins/libkineto/README.md deleted file mode 100644 index 37127ca5aa8..00000000000 --- a/plugins/tensorboard-plugins/libkineto/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Libkineto - -Libkineto is an in-process profiling library, part of the Kineto performance -tools project. - -The library provides a way to collect GPU traces and metrics from the host -process, either via the library public API or by sending a signal, if enabled. - -Currently only NVIDIA GPUs are supported. - -## Build Notes -Libkineto uses the standard CMAKE-based build flow. - -### Dependencies -Libkineto requires gcc 5+ and: - -- NVIDIA CUPTI: used to collect traces and metrics from NVIDIA GPUs. -- fmt: used for its convenient and lightweight string formatting functionality. -- googletest: required to build and run Kineto's tests. - - **googletest is not required** if you don't want to run Kineto tests. -By default, building of tests is **on**. Turn it off by setting `KINETO_BUILD_TESTS` to **off**. - -You can download [NVIDIA CUPTI][1], [fmt][2], [googletest][3] and set -`CUDA_SOURCE_DIR`, `FMT_SOURCE_DIR`, `GOOGLETEST_SOURCE_DIR` respectively for -cmake to find these libraries. 
If the fmt and googletest variables are not set, cmake will -build the git submodules found in the `third_party` directory. -If `CUDA_SOURCE_DIR` is not set, libkineto will fail to build. - -### Building Libkineto - -``` -# Check out repo and sub modules -git clone --recursive https://github.com/pytorch/kineto.git -# Build libkineto with cmake -cd kineto/libkineto -mkdir build && cd build -cmake .. -make -``` - -To run the tests after building libkineto (if tests are built), use the following -command: -``` -make test -``` - -### Installing Libkineto -``` -make install -``` - -## How Libkineto works -We will provide a high-level overview, design philosophy and brief descriptions of various -parts of Libkineto in upcoming blogs. - -## Full documentation -We strive to keep our source files readable. The best and up-to-date -documentation is available in the source files. - -## License -Libkineto is BSD licensed, as detailed in the [LICENSE](../LICENSE) file. - -[1]:https://developer.nvidia.com/CUPTI-CTK10_2 -[2]:https://github.com/fmt -[3]:https://github.com/google/googletest diff --git a/plugins/tensorboard-plugins/libkineto/include/AbstractConfig.h b/plugins/tensorboard-plugins/libkineto/include/AbstractConfig.h deleted file mode 100644 index 1cadf4906c1..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/AbstractConfig.h +++ /dev/null @@ -1,113 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -class AbstractConfig { - public: - AbstractConfig& operator=(const AbstractConfig&) = delete; - AbstractConfig(AbstractConfig&&) = delete; - AbstractConfig& operator=(AbstractConfig&&) = delete; - - virtual ~AbstractConfig() { - for (const auto& p : featureConfigs_) { - delete p.second; - } - } - - // Return a copy of the full derived class - virtual AbstractConfig* cloneDerived(AbstractConfig& parent) const = 0; - - // Returns true if successfully parsed the config string - bool parse(const std::string& conf); - - // Default setup for signal-triggered profiling - virtual void setSignalDefaults() { - for (auto& p : featureConfigs_) { - p.second->setSignalDefaults(); - } - } - - // Default setup for client-triggered profiling - virtual void setClientDefaults() { - for (auto& p : featureConfigs_) { - p.second->setClientDefaults(); - } - } - - // Time config was created / updated - std::chrono::time_point timestamp() const { - return timestamp_; - } - - // Source config string that this was parsed from - const std::string& source() const { - return source_; - } - - AbstractConfig& feature(std::string name) const { - const auto& pos = featureConfigs_.find(name); - return *pos->second; - } - - // Transfers ownership of cfg arg - void addFeature(const std::string& name, AbstractConfig* cfg) { - featureConfigs_[name] = cfg; - } - - protected: - AbstractConfig() {} - AbstractConfig(const AbstractConfig& other) = default; - - // Return true if the option was recognized and successfully parsed. - // Throw std::invalid_argument if val is invalid. - virtual bool handleOption(const std::string& name, std::string& val); - - // Perform post-validation checks, typically conditons involving - // multiple options. - // Throw std::invalid_argument if automatic correction can not be made. 
- // - // @param fallbackProfileStartTime Specify a fallback profile start timestamp in case it was never specified by the client - virtual void validate(const std::chrono::time_point& fallbackProfileStartTime) = 0; - - // TODO: Separate out each profiler type into features? - virtual void printActivityProfilerConfig(std::ostream& s) const; - - // Helpers for use in handleOption - // Split a string by delimiter and remove external white space - std::vector splitAndTrim(const std::string& s, char delim) const; - // Lowercase for case-insensitive comparisons - std::string toLower(std::string& s) const; - // Does string end with suffix - bool endsWith(const std::string& s, const std::string& suffix) const; - // Conversions - int64_t toIntRange(const std::string& val, int64_t min, int64_t max) const; - int32_t toInt32(const std::string& val) const; - int64_t toInt64(const std::string& val) const; - bool toBool(std::string& val) const; - - void cloneFeaturesInto(AbstractConfig& cfg) const { - for (const auto& feature : featureConfigs_) { - cfg.featureConfigs_[feature.first] = feature.second->cloneDerived(cfg); - } - } - - private: - // Time config was created / updated - std::chrono::time_point timestamp_{}; - - // Original configuration string, used for comparison - std::string source_{""}; - - // Configuration objects for optional features - std::map featureConfigs_{}; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/include/ActivityProfilerInterface.h b/plugins/tensorboard-plugins/libkineto/include/ActivityProfilerInterface.h deleted file mode 100644 index 29871e47ab8..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/ActivityProfilerInterface.h +++ /dev/null @@ -1,91 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include - -#include "ActivityType.h" -#include "ActivityTraceInterface.h" -#include "IActivityProfiler.h" - -namespace libkineto { - -class ActivityProfilerController; -struct CpuTraceBuffer; -class Config; - -class ActivityProfilerInterface { - - public: - virtual ~ActivityProfilerInterface() {}; - - virtual void init() {} - virtual bool isInitialized() { - return false; - } - virtual bool isActive(){ - return false; - } - - // *** Asynchronous API *** - // Instead of starting and stopping the trace manually, provide a start time - // and duration and / or iteration stop criterion. - // Tracing terminates when either condition is met. - virtual void scheduleTrace(const std::string& configStr) {} - - // *** Synchronous API *** - // These must be called in order: - // prepareTrace -> startTrace -> stopTrace. - - // Many tracing structures are lazily initialized during trace collection, - // with potentially high overhead. - // Call prepareTrace to enable tracing, then run the region to trace - // at least once (and ideally run the same code that is to be traced) to - // allow tracing structures to be initialized. - virtual void prepareTrace( - const std::set& activityTypes, - const std::string& configStr = "") {} - - // Start recording, potentially reusing any buffers allocated since - // prepareTrace was called. - virtual void startTrace() {} - - // Stop and process trace, producing an in-memory list of trace records. - // The processing will be done synchronously (using the calling thread.) - virtual std::unique_ptr stopTrace() { - return nullptr; - } - - // Re-evaluate internal state to allow for triggering operations based - // on number of iteration. each implicitly increments the iteration count - virtual void step() {} - - // *** TraceActivity API *** - // FIXME: Pass activityProfiler interface into clientInterface? 
- virtual void pushCorrelationId(uint64_t id){} - virtual void popCorrelationId(){} - virtual void transferCpuTrace( - std::unique_ptr traceBuffer){} - - // Correlation ids for user defined spans - virtual void pushUserCorrelationId(uint64_t){} - virtual void popUserCorrelationId(){} - - // Saves information for the current thread to be used in profiler output - // Client must record any new kernel thread where the activity has occured. - virtual void recordThreadInfo() {} - - // Record trace metadata, currently supporting only string key and values, - // values with the same key are overwritten - virtual void addMetadata(const std::string& key, const std::string& value) = 0; - - // Add a child activity profiler, this enables frameworks in the application - // to enable custom framework events. - virtual void addChildActivityProfiler( - std::unique_ptr profiler) {} -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/ActivityTraceInterface.h b/plugins/tensorboard-plugins/libkineto/include/ActivityTraceInterface.h deleted file mode 100644 index 23d4edab00c..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/ActivityTraceInterface.h +++ /dev/null @@ -1,21 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include - -namespace libkineto { - -struct ITraceActivity; - -class ActivityTraceInterface { - public: - virtual ~ActivityTraceInterface() {} - virtual const std::vector* activities() { - return nullptr; - } - virtual void save(const std::string& path) {} -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/ActivityType.h b/plugins/tensorboard-plugins/libkineto/include/ActivityType.h deleted file mode 100644 index 74c6a2531d6..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/ActivityType.h +++ /dev/null @@ -1,34 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include - -namespace libkineto { - -enum class ActivityType { - CPU_OP = 0, // cpu side ops - USER_ANNOTATION, - GPU_USER_ANNOTATION, - GPU_MEMCPY, - GPU_MEMSET, - CONCURRENT_KERNEL, // on-device kernels - EXTERNAL_CORRELATION, - CUDA_RUNTIME, // host side cuda runtime events - CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics - GLOW_RUNTIME, // host side glow runtime events - CPU_INSTANT_EVENT, // host side point-like events - PYTHON_FUNCTION, - OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. - ENUM_COUNT // This is to add buffer and not used for any profiling logic. Add your new type before it. -}; - -const char* toString(ActivityType t); -ActivityType toActivityType(const std::string& str); - -// Return an array of all activity types except COUNT -constexpr int activityTypeCount = (int)ActivityType::ENUM_COUNT; -const std::array activityTypes(); - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/ClientInterface.h b/plugins/tensorboard-plugins/libkineto/include/ClientInterface.h deleted file mode 100644 index 06dc0758381..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/ClientInterface.h +++ /dev/null @@ -1,16 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -namespace libkineto { - -class ClientInterface { - public: - virtual ~ClientInterface() {} - virtual void init() = 0; - virtual void warmup(bool setupOpInputsCollection) = 0; - virtual void start() = 0; - virtual void stop() = 0; -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/Config.h b/plugins/tensorboard-plugins/libkineto/include/Config.h deleted file mode 100644 index 040e96c9f75..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/Config.h +++ /dev/null @@ -1,433 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include "AbstractConfig.h" -#include "ActivityType.h" - -#include -#include -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -class Config : public AbstractConfig { - public: - Config(); - Config& operator=(const Config&) = delete; - Config(Config&&) = delete; - Config& operator=(Config&&) = delete; - - // Return a full copy including feature config object - std::unique_ptr clone() const { - auto cfg = std::unique_ptr(new Config(*this)); - cloneFeaturesInto(*cfg); - return cfg; - } - - bool handleOption(const std::string& name, std::string& val) override; - - void setClientDefaults() override; - - // Log events to this file - const std::string& eventLogFile() const { - return eventLogFile_; - } - - bool activityProfilerEnabled() const { - return activityProfilerEnabled_ || - activitiesOnDemandTimestamp_.time_since_epoch().count() > 0; - } - - // Log activitiy trace to this file - const std::string& activitiesLogFile() const { - return activitiesLogFile_; - } - - // Log activitiy trace to this url - const std::string& activitiesLogUrl() const { - return activitiesLogUrl_; - } - - void setActivitiesLogUrl(const std::string& url) { - activitiesLogUrl_ = url; - } - - bool activitiesLogToMemory() const { - return activitiesLogToMemory_; - } - - // Is profiling enabled for the given device? - bool eventProfilerEnabledForDevice(uint32_t dev) const { - return 0 != (eventProfilerDeviceMask_ & (1 << dev)); - } - - // Take a sample (read hardware counters) at this frequency. - // This controls how often counters are read - if all counters cannot - // be collected simultaneously then multiple samples are needed to - // collect all requested counters - see multiplex period. 
- std::chrono::milliseconds samplePeriod() const { - return samplePeriod_; - } - - void setSamplePeriod(std::chrono::milliseconds period) { - samplePeriod_ = period; - } - - // When all requested counters cannot be collected simultaneously, - // counters will be multiplexed at this frequency. - // Multiplexing can have a large performance impact if done frequently. - // To avoid a perf impact, keep this at 1s or above. - std::chrono::milliseconds multiplexPeriod() const { - return multiplexPeriod_; - } - - void setMultiplexPeriod(std::chrono::milliseconds period) { - multiplexPeriod_ = period; - } - - // Report counters at this frequency. Note that several samples can - // be reported each time, see samplesPerReport. - std::chrono::milliseconds reportPeriod() const { - return reportPeriod_; - } - - void setReportPeriod(std::chrono::milliseconds msecs); - - // Number of samples dispatched each report period. - // Must be in the range [1, report period / sample period]. - // In other words, aggregation is supported but not interpolation. 
- int samplesPerReport() const { - return samplesPerReport_; - } - - void setSamplesPerReport(int count) { - samplesPerReport_ = count; - } - - // The names of events to collect - const std::set& eventNames() const { - return eventNames_; - } - - // Add additional events to be profiled - void addEvents(const std::set& names) { - eventNames_.insert(names.begin(), names.end()); - } - - // The names of metrics to collect - const std::set& metricNames() const { - return metricNames_; - } - - // Add additional metrics to be profiled - void addMetrics(const std::set& names) { - metricNames_.insert(names.begin(), names.end()); - } - - const std::vector& percentiles() const { - return eventReportPercentiles_; - } - - // Profile for this long, then revert to base config - std::chrono::seconds eventProfilerOnDemandDuration() const { - return eventProfilerOnDemandDuration_; - } - - void setEventProfilerOnDemandDuration(std::chrono::seconds duration) { - eventProfilerOnDemandDuration_ = duration; - } - - // Too many event profilers on a single system can overload the driver. - // At some point, latencies shoot through the roof and collection of samples - // becomes impossible. To avoid this situation we have a limit of profilers - // per GPU. - // NOTE: Communication with a daemon is needed for this feature. - // Library must be built with an active DaemonConfigLoader. - int maxEventProfilersPerGpu() const { - return eventProfilerMaxInstancesPerGpu_; - } - - // On Cuda11 we've seen occasional hangs when reprogramming counters - // Monitor profiling threads and report when a thread is not responding - // for a given number of seconds. - // A period of 0 means disable. 
- std::chrono::seconds eventProfilerHeartbeatMonitorPeriod() const { - return eventProfilerHeartbeatMonitorPeriod_; - } - - // The types of activities selected in the configuration file - const std::set& selectedActivityTypes() const { - return selectedActivityTypes_; - } - - void setSelectedActivityTypes(const std::set& types) { - selectedActivityTypes_ = types; - } - - bool isOpInputsCollectionEnabled() const { - return enableOpInputsCollection_; - } - - // Trace for this long - std::chrono::milliseconds activitiesDuration() const { - return activitiesDuration_; - } - - // Trace for this many iterations, determined by external API - int activitiesRunIterations() const { - return activitiesRunIterations_; - } - - std::chrono::milliseconds activitiesDurationDefault() const; - - void setActivitiesDuration(std::chrono::milliseconds duration) { - activitiesDuration_ = duration; - } - - int activitiesMaxGpuBufferSize() const { - return activitiesMaxGpuBufferSize_; - } - - std::chrono::seconds activitiesWarmupDuration() const { - return activitiesWarmupDuration_; - } - - int activitiesWarmupIterations() const { - return activitiesWarmupIterations_; - } - - // Timestamp at which the profiling to start, requested by the user. 
- const std::chrono::time_point requestTimestamp() - const { - if (profileStartTime_.time_since_epoch().count()) { - return profileStartTime_; - } - - // TODO(T94634890): Deperecate requestTimestamp - return requestTimestamp_ + maxRequestAge() + activitiesWarmupDuration(); - } - - bool hasProfileStartTime() const { - return requestTimestamp_.time_since_epoch().count() > 0 || - profileStartTime_.time_since_epoch().count() > 0; - } - - int profileStartIteration() const { - return profileStartIteration_; - } - - bool hasProfileStartIteration() const { - return profileStartIteration_ >= 0 && activitiesRunIterations_ > 0; - } - - void setProfileStartIteration(int iter) { - profileStartIteration_ = iter; - } - - int profileStartIterationRoundUp() const { - return profileStartIterationRoundUp_; - } - - // calculate the start iteration accounting for warmup - int startIterationIncludingWarmup() const { - if (!hasProfileStartIteration()) { - return -1; - } - return profileStartIteration_ - activitiesWarmupIterations_; - } - - const std::chrono::seconds maxRequestAge() const; - - // All VLOG* macros will log if the verbose log level is >= - // the verbosity specified for the verbose log message. - // Default value is -1, so messages with log level 0 will log by default. - int verboseLogLevel() const { - return verboseLogLevel_; - } - - // Modules for which verbose logging is enabled. - // If empty, logging is enabled for all modules. 
- const std::vector& verboseLogModules() const { - return verboseLogModules_; - } - - bool sigUsr2Enabled() const { - return enableSigUsr2_; - } - - bool ipcFabricEnabled() const { - return enableIpcFabric_; - } - - static std::chrono::milliseconds alignUp( - std::chrono::milliseconds duration, - std::chrono::milliseconds alignment) { - duration += alignment; - return duration - (duration % alignment); - } - - std::chrono::time_point - eventProfilerOnDemandStartTime() const { - return eventProfilerOnDemandTimestamp_; - } - - std::chrono::time_point - eventProfilerOnDemandEndTime() const { - return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_; - } - - std::chrono::time_point - activityProfilerRequestReceivedTime() const { - return activitiesOnDemandTimestamp_; - } - - // Users may request and set trace id and group trace id. - const std::string& requestTraceID() const { - return requestTraceID_; - } - - void setRequestTraceID(const std::string& tid) { - requestTraceID_ = tid; - } - - const std::string& requestGroupTraceID() const { - return requestGroupTraceID_; - } - - void setRequestGroupTraceID(const std::string& gtid) { - requestGroupTraceID_ = gtid; - } - - void updateActivityProfilerRequestReceivedTime(); - - void printActivityProfilerConfig(std::ostream& s) const override; - - void validate( - const std::chrono::time_point& fallbackProfileStartTime) override; - - static void addConfigFactory( - std::string name, - std::function factory); - - void print(std::ostream& s) const; - - private: - explicit Config(const Config& other) = default; - - AbstractConfig* cloneDerived(AbstractConfig& parent) const override { - // Clone from AbstractConfig not supported - assert(false); - return nullptr; - } - - uint8_t createDeviceMask(const std::string& val); - - // Adds valid activity types from the user defined string list in the - // configuration file - void setActivityTypes(const std::vector& selected_activities); - - // Sets the default activity 
types to be traced - void selectDefaultActivityTypes() { - // If the user has not specified an activity list, add all types - for (ActivityType t : activityTypes()) { - // Do no enable this by default - // TODO: introduce optional types - if (t != ActivityType::OVERHEAD) { - selectedActivityTypes_.insert(t); - } - } - } - - int verboseLogLevel_; - std::vector verboseLogModules_; - - // Event profiler - // These settings are also supported in on-demand mode - std::chrono::milliseconds samplePeriod_; - std::chrono::milliseconds reportPeriod_; - int samplesPerReport_; - std::set eventNames_; - std::set metricNames_; - - // On-demand duration - std::chrono::seconds eventProfilerOnDemandDuration_; - // Last on-demand request - std::chrono::time_point - eventProfilerOnDemandTimestamp_; - - int eventProfilerMaxInstancesPerGpu_; - - // Monitor whether event profiler threads are stuck - // at this frequency - std::chrono::seconds eventProfilerHeartbeatMonitorPeriod_; - - // These settings can not be changed on-demand - std::string eventLogFile_; - std::vector eventReportPercentiles_ = {5, 25, 50, 75, 95}; - uint8_t eventProfilerDeviceMask_ = ~0; - std::chrono::milliseconds multiplexPeriod_; - - // Activity profiler - bool activityProfilerEnabled_; - std::set selectedActivityTypes_; - - // The activity profiler settings are all on-demand - std::string activitiesLogFile_; - - std::string activitiesLogUrl_; - - // Log activities to memory buffer - bool activitiesLogToMemory_{false}; - - int activitiesMaxGpuBufferSize_; - std::chrono::seconds activitiesWarmupDuration_; - int activitiesWarmupIterations_; - - // Client Interface - // Enable inputs collection when tracing ops - bool enableOpInputsCollection_{true}; - - // Profile for specified iterations and duration - std::chrono::milliseconds activitiesDuration_; - int activitiesRunIterations_; - - // Below are not used - // Use this net name for iteration count - std::string activitiesExternalAPIIterationsTarget_; - // Only 
profile nets that includes this in the name - std::vector activitiesExternalAPIFilter_; - // Only profile nets with at least this many operators - int activitiesExternalAPINetSizeThreshold_; - // Only profile nets with at least this many GPU operators - int activitiesExternalAPIGpuOpCountThreshold_; - // Last activity profiler request - std::chrono::time_point - activitiesOnDemandTimestamp_; - - // Synchronized start timestamp - std::chrono::time_point profileStartTime_; - // or start iteration - int profileStartIteration_; - int profileStartIterationRoundUp_; - - // DEPRECATED - std::chrono::time_point requestTimestamp_; - - // Enable profiling via SIGUSR2 - bool enableSigUsr2_; - - // Enable IPC Fabric instead of thrift communication - bool enableIpcFabric_; - - // Logger Metadata - std::string requestTraceID_; - std::string requestGroupTraceID_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/include/GenericTraceActivity.h b/plugins/tensorboard-plugins/libkineto/include/GenericTraceActivity.h deleted file mode 100644 index 4272cf1efa4..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/GenericTraceActivity.h +++ /dev/null @@ -1,125 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include - -#include "ThreadUtil.h" -#include "ITraceActivity.h" -#include "TraceSpan.h" - -namespace libkineto { - -// Link type, used in GenericTraceActivity.flow.type -constexpr unsigned int kLinkFwdBwd = 1; -constexpr unsigned int kLinkAsyncCpuGpu = 2; - -// @lint-ignore-every CLANGTIDY cppcoreguidelines-non-private-member-variables-in-classes -// @lint-ignore-every CLANGTIDY cppcoreguidelines-pro-type-member-init -class GenericTraceActivity : public ITraceActivity { - - public: - GenericTraceActivity() : activityType(ActivityType::ENUM_COUNT), traceSpan_(NULL) {} - - GenericTraceActivity( - const TraceSpan& trace, ActivityType type, const std::string& name) - : activityType(type), activityName(name), traceSpan_(&trace) { - } - - int64_t deviceId() const override { - return device; - } - - int64_t resourceId() const override { - return resource; - } - - int32_t getThreadId() const override { - return threadId; - } - - int64_t timestamp() const override { - return startTime; - } - - int64_t duration() const override { - return endTime - startTime; - } - - int64_t correlationId() const override { - return id; - } - - ActivityType type() const override { - return activityType; - } - - const ITraceActivity* linkedActivity() const override { - return nullptr; - } - - int flowType() const override { - return flow.type; - } - - int flowId() const override { - return flow.id; - } - - bool flowStart() const override { - return flow.start; - } - - const std::string name() const override { - return activityName; - } - - const TraceSpan* traceSpan() const override { - return traceSpan_; - } - - void log(ActivityLogger& logger) const override; - - //Encode client side metadata as a key/value - template - void addMetadata(const std::string& key, const ValType& value) { - metadata_.push_back(fmt::format("\"{}\": {}", key, value)); - } - - void addMetadataQuoted(const std::string& key, const std::string& value) { - 
metadata_.push_back(fmt::format("\"{}\": \"{}\"", key, value)); - } - - const std::string metadataJson() const override { - return fmt::format("{}", fmt::join(metadata_, ", ")); - } - - virtual ~GenericTraceActivity() {}; - - int64_t startTime{0}; - int64_t endTime{0}; - int32_t id{0}; - int32_t device{0}; - int32_t resource{0}; - int32_t threadId{0}; - ActivityType activityType; - std::string activityName; - struct Flow { - Flow(): id(0), type(0), start(0) {} - // Ids must be unique within each type - uint32_t id : 27; - // Type will be used to connect flows between profilers, as - // well as look up flow information (name etc) - uint32_t type : 4; - uint32_t start : 1; - } flow; - - private: - const TraceSpan* traceSpan_; - std::vector metadata_; -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/IActivityProfiler.h b/plugins/tensorboard-plugins/libkineto/include/IActivityProfiler.h deleted file mode 100644 index f5d4b3fb828..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/IActivityProfiler.h +++ /dev/null @@ -1,104 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include - -#include "Config.h" -#include "GenericTraceActivity.h" - -/* This file includes an abstract base class for an activity profiler - * that can be implemented by multiple tracing agents in the application. - * The high level Kineto profiler can co-ordinate start and end of tracing - * and combine together events from multiple such activity profilers. 
- */ - -namespace libkineto { - -using namespace KINETO_NAMESPACE; - -#ifdef _MSC_VER -// workaround for the predefined ERROR macro on Windows -#undef ERROR -#endif // _MSC_VER - -enum class TraceStatus { - READY, // Accepting trace requests - WARMUP, // Performing trace warmup - RECORDING, // Actively collecting activities - PROCESSING, // Recording is complete, preparing results - ERROR, // One or more errors (and possibly also warnings) occurred. - WARNING, // One or more warnings occurred. -}; - -/* IActivityProfilerSession: - * an opaque object that can be used by a high level profiler to - * start/stop and return trace events. - */ -class IActivityProfilerSession { - - public: - virtual ~IActivityProfilerSession() {} - - // start the trace collection synchronously - virtual void start() = 0; - - // stop the trace collection synchronously - virtual void stop() = 0; - - TraceStatus status() { - return status_; - } - - // returns list of Trace Activities - virtual std::vector& activities() = 0; - - // returns errors with this trace - virtual std::vector errors() = 0; - - // processes trace activities using logger - virtual void processTrace(ActivityLogger& logger) = 0; - - // XXX define trace formats - // virtual save(string name, TraceFormat format) - - protected: - TraceStatus status_ = TraceStatus::READY; -}; - - -/* Activity Profiler Plugins: - * These allow other frameworks to integrate into Kineto's primariy - * activity profiler. While the primary activity profiler handles - * timing the trace collections and correlating events the plugins - * can become source of new trace activity types. - */ -class IActivityProfiler { - - public: - - virtual ~IActivityProfiler() {} - - // name of profiler - virtual const std::string& name() const = 0; - - // returns activity types this profiler supports - virtual const std::set& availableActivities() const = 0; - - // Calls prepare() on registered tracer providers passing in the relevant - // activity types. 
Returns a profiler session handle - virtual std::unique_ptr configure( - const std::set& activity_types, - const Config& config) = 0; - - // asynchronous version of the above with future timestamp and duration. - virtual std::unique_ptr configure( - int64_t ts_ms, - int64_t duration_ms, - const std::set& activity_types, - const Config& config) = 0; -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/ILoggerObserver.h b/plugins/tensorboard-plugins/libkineto/include/ILoggerObserver.h deleted file mode 100644 index 4fce7851b96..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/ILoggerObserver.h +++ /dev/null @@ -1,50 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -// Stages in libkineto used when pushing logs to UST Logger. -constexpr char kWarmUpStage[] = "Warm Up"; -constexpr char kCollectionStage[] = "Collection"; -constexpr char kPostProcessingStage[] = "Post Processing"; - -#if !USE_GOOGLE_LOG - -#include -#include - -namespace libkineto { - -enum LoggerOutputType { - VERBOSE = 0, - INFO = 1, - WARNING = 2, - ERROR = 3, - STAGE = 4, - ENUM_COUNT = 5 -}; - -const char* toString(LoggerOutputType t); -LoggerOutputType toLoggerOutputType(const std::string& str); - -constexpr int LoggerTypeCount = (int) LoggerOutputType::ENUM_COUNT; - -class ILoggerObserver { - public: - virtual ~ILoggerObserver() = default; - virtual void write(const std::string& message, LoggerOutputType ot) = 0; - virtual const std::map> extractCollectorMetadata() = 0; - virtual void reset() = 0; - virtual void addDevice(const int64_t device) = 0; - virtual void setTraceDurationMS(const int64_t duration) = 0; - virtual void addEventCount(const int64_t count) = 0; - virtual void setTraceID(const std::string&) {} - virtual void setGroupTraceID(const std::string&) {} - virtual void addDestination(const std::string& dest) = 0; - -}; - -} // namespace libkineto - -#endif // 
!USE_GOOGLE_LOG diff --git a/plugins/tensorboard-plugins/libkineto/include/ITraceActivity.h b/plugins/tensorboard-plugins/libkineto/include/ITraceActivity.h deleted file mode 100644 index a477ed81466..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/ITraceActivity.h +++ /dev/null @@ -1,53 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -#include "ActivityType.h" - -namespace libkineto { - -class ActivityLogger; -struct TraceSpan; - -// Generic activity interface is borrowed from tensorboard protobuf format. -struct ITraceActivity { - virtual ~ITraceActivity() {} - // Device is a physical or logical entity, e.g. CPU, GPU or process - virtual int64_t deviceId() const = 0; - // A resource is something on the device, h/w thread, - // functional units etc. - virtual int64_t resourceId() const = 0; - // s/w thread - virtual int32_t getThreadId() const = 0; - // Start timestamp in mucrosecond - virtual int64_t timestamp() const = 0; - // Duration in microseconds - virtual int64_t duration() const = 0; - // Used to link up async activities - virtual int64_t correlationId() const = 0; - // Part of a flow, identified by flow id and type - virtual int flowType() const = 0; - virtual int flowId() const = 0; - virtual bool flowStart() const = 0; - virtual ActivityType type() const = 0; - virtual const std::string name() const = 0; - // Optional linked activity - virtual const ITraceActivity* linkedActivity() const = 0; - // Optional containing trace object - virtual const TraceSpan* traceSpan() const = 0; - // Log activity - virtual void log(ActivityLogger& logger) const = 0; - // Return json formatted metadata - // FIXME: Return iterator to dynamic type map here instead - virtual const std::string metadataJson() const = 0; - - static int64_t nsToUs(int64_t ns) { - // It's important that this conversion is the same everywhere. - // No rounding! 
- return ns / 1000; - } -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/ThreadUtil.h b/plugins/tensorboard-plugins/libkineto/include/ThreadUtil.h deleted file mode 100644 index d1dc80ad2ab..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/ThreadUtil.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace libkineto { - -int32_t systemThreadId(); -int32_t threadId(); -bool setThreadName(const std::string& name); -std::string getThreadName(); - -int32_t processId(); -std::string processName(int32_t pid); - -// Return a list of pids and process names for the current process -// and its parents. -std::vector> pidCommandPairsOfAncestors(); - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/TraceSpan.h b/plugins/tensorboard-plugins/libkineto/include/TraceSpan.h deleted file mode 100644 index af9a9d5ee55..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/TraceSpan.h +++ /dev/null @@ -1,36 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include - -namespace libkineto { - -struct TraceSpan { - TraceSpan() = delete; - TraceSpan( - int64_t startTime, int64_t endTime, std::string name) - : startTime(startTime), endTime(endTime), name(std::move(name)) { - } - TraceSpan( - int opCount, int it, std::string name, std::string prefix) - : opCount(opCount), - iteration(it), - name(std::move(name)), - prefix(std::move(prefix)) { - } - - // FIXME: change to duration? 
- int64_t startTime{0}; - int64_t endTime{0}; - int opCount{0}; - int iteration{-1}; - // Name is used to identify timeline - std::string name; - // Prefix used to distinguish trace spans on the same timeline - std::string prefix; -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/libkineto.h b/plugins/tensorboard-plugins/libkineto/include/libkineto.h deleted file mode 100644 index 87c3d64f638..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/libkineto.h +++ /dev/null @@ -1,138 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -// Mediator for initialization and profiler control - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ActivityProfilerInterface.h" -#include "ActivityType.h" -#include "ClientInterface.h" -#include "GenericTraceActivity.h" -#include "TraceSpan.h" -#include "IActivityProfiler.h" -#include "ActivityTraceInterface.h" - -#include "ThreadUtil.h" - -extern "C" { - void suppressLibkinetoLogMessages(); - int InitializeInjection(void); - bool libkineto_init(bool cpuOnly, bool logOnError); -} - -namespace libkineto { - -class Config; -class ConfigLoader; - -struct CpuTraceBuffer { - TraceSpan span{0, 0, "none"}; - int gpuOpCount; - std::deque activities; -}; - -using ChildActivityProfilerFactory = - std::function()>; - -class LibkinetoApi { - public: - - explicit LibkinetoApi(ConfigLoader& configLoader) - : configLoader_(configLoader) { - } - - // Called by client that supports tracing API. - // libkineto can still function without this. 
- void registerClient(ClientInterface* client); - - // Called by libkineto on init - void registerProfiler(std::unique_ptr profiler) { - activityProfiler_ = std::move(profiler); - initClientIfRegistered(); - } - - ActivityProfilerInterface& activityProfiler() { - return *activityProfiler_; - } - - ClientInterface* client() { - return client_; - } - - void initProfilerIfRegistered() { - static std::once_flag once; - if (activityProfiler_) { - std::call_once(once, [this] { - if (!activityProfiler_->isInitialized()) { - activityProfiler_->init(); - initChildActivityProfilers(); - } - }); - } - } - - bool isProfilerInitialized() const { - return activityProfiler_ && activityProfiler_->isInitialized(); - } - - bool isProfilerRegistered() const { - return activityProfiler_ != nullptr; - } - - void suppressLogMessages() { - suppressLibkinetoLogMessages(); - } - - // Provides access to profier configuration manaegement - ConfigLoader& configLoader() { - return configLoader_; - } - - void registerProfilerFactory( - ChildActivityProfilerFactory factory) { - if (isProfilerInitialized()) { - activityProfiler_->addChildActivityProfiler(factory()); - } else { - childProfilerFactories_.push_back(factory); - } - } - - private: - - void initChildActivityProfilers() { - if (!isProfilerInitialized()) { - return; - } - for (const auto& factory : childProfilerFactories_) { - activityProfiler_->addChildActivityProfiler(factory()); - } - childProfilerFactories_.clear(); - } - - // Client is initialized once both it and libkineto has registered - void initClientIfRegistered(); - - ConfigLoader& configLoader_; - std::unique_ptr activityProfiler_{}; - ClientInterface* client_{}; - int32_t clientRegisterThread_{0}; - - bool isLoaded_{false}; - std::vector childProfilerFactories_; -}; - -// Singleton -LibkinetoApi& api(); - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/include/time_since_epoch.h b/plugins/tensorboard-plugins/libkineto/include/time_since_epoch.h 
deleted file mode 100644 index caa6b4d9276..00000000000 --- a/plugins/tensorboard-plugins/libkineto/include/time_since_epoch.h +++ /dev/null @@ -1,16 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -namespace libkineto { - -inline int64_t timeSinceEpoch( - const std::chrono::time_point& t) { - return std::chrono::duration_cast( - t.time_since_epoch()) - .count(); -} - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/libkineto_defs.bzl b/plugins/tensorboard-plugins/libkineto/libkineto_defs.bzl deleted file mode 100644 index 330c54a22df..00000000000 --- a/plugins/tensorboard-plugins/libkineto/libkineto_defs.bzl +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -def get_libkineto_api_srcs(): - return [ - "src/ThreadUtil.cpp", - "src/libkineto_api.cpp", - ] - -def get_libkineto_cupti_srcs(with_api = True): - return [ - "src/CudaDeviceProperties.cpp", - "src/CuptiActivityApi.cpp", - "src/CuptiActivityPlatform.cpp", - "src/CuptiCallbackApi.cpp", - "src/CuptiEventApi.cpp", - "src/CuptiMetricApi.cpp", - "src/CuptiRangeProfilerApi.cpp", - "src/Demangle.cpp", - "src/EventProfiler.cpp", - "src/EventProfilerController.cpp", - "src/WeakSymbols.cpp", - "src/cupti_strings.cpp", - ] + (get_libkineto_cpu_only_srcs(with_api)) - -def get_libkineto_roctracer_srcs(with_api = True): - return [ - "src/RoctracerActivityApi.cpp", - ] + (get_libkineto_cpu_only_srcs(with_api)) - -def get_libkineto_cpu_only_srcs(with_api = True): - return [ - "src/AbstractConfig.cpp", - "src/CuptiActivityProfiler.cpp", - "src/ActivityProfilerController.cpp", - "src/ActivityProfilerProxy.cpp", - "src/ActivityType.cpp", - "src/Config.cpp", - "src/ConfigLoader.cpp", - "src/CuptiActivityApi.cpp", - "src/Demangle.cpp", - 
"src/GenericTraceActivity.cpp", - "src/ILoggerObserver.cpp", - "src/Logger.cpp", - "src/init.cpp", - "src/output_csv.cpp", - "src/output_json.cpp", - ] + (get_libkineto_api_srcs() if with_api else []) - -def get_libkineto_public_headers(): - return [ - "include/AbstractConfig.h", - "include/ActivityProfilerInterface.h", - "include/ActivityType.h", - "include/Config.h", - "include/ClientInterface.h", - "include/GenericTraceActivity.h", - "include/GenericTraceActivity.h", - "include/IActivityProfiler.h", - "include/ILoggerObserver.h", - "include/ITraceActivity.h", - "include/TraceSpan.h", - "include/ThreadUtil.h", - "include/libkineto.h", - "include/time_since_epoch.h", - ] - -# kineto code should be updated to not have to -# suppress these warnings. -KINETO_COMPILER_FLAGS = [ - "-fexceptions", - "-Wno-deprecated-declarations", - "-Wno-unused-function", - "-Wno-unused-private-field", -] diff --git a/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cpp b/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cpp deleted file mode 100644 index 780047912ed..00000000000 --- a/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include -#include -#include - -#include -#include - -#include "kineto/libkineto/sample_programs/kineto_playground.cuh" - -using namespace kineto; - -static const std::string kFileName = "/tmp/kineto_playground_trace.json"; - -int main() { - warmup(); - - // Kineto config - - // Empty types set defaults to all types - std::set types; - - auto& profiler = libkineto::api().activityProfiler(); - libkineto::api().initProfilerIfRegistered(); - profiler.prepareTrace(types); - - // Good to warm up after prepareTrace to get cupti initialization to settle - warmup(); - profiler.startTrace(); - playground(); - - auto trace = profiler.stopTrace(); - LOG(INFO) << "Stopped and processed trace. Got " << trace->activities()->size() << " activities."; - trace->save(kFileName); - return 0; -} - diff --git a/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cu b/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cu deleted file mode 100644 index 54c6f82ff4b..00000000000 --- a/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cu +++ /dev/null @@ -1,60 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include - -#include "kineto_playground.cuh" - - -namespace kineto { - -void warmup(void) { - // Inititalizing CUDA can take a while which we normally do not want to see in Kineto traces. - // This is done in various ways that take Kineto as dependency. This is our way of doing warmup - // for kineto_playground - size_t bytes = 1000; - float* mem = NULL; - auto error = cudaMalloc(&mem, bytes); - if (error != cudaSuccess) { - printf("cudaMalloc failed during kineto_playground warmup. 
error code: %d", error); - return; - } - - cudaFree(mem); -} - -void basicMemcpyMemset(void) { - size_t size = (1 << 8) * sizeof(float); - float *hostMemSrc, *deviceMem, *hostMemDst; - cudaError_t err; - - hostMemSrc = (float*)malloc(size); - hostMemDst = (float*)malloc(size); - err = cudaMalloc(&deviceMem, size); - if (err != cudaSuccess) { - printf("cudaMalloc failed during %s", __func__); - return; - } - - memset(hostMemSrc, 1, size); - cudaMemcpy(deviceMem, hostMemSrc, size, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - printf("cudaMemcpy failed during %s", __func__); - return; - } - - cudaMemcpy(hostMemDst, deviceMem, size, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - printf("cudaMemcpy failed during %s", __func__); - return; - } - - free(hostMemSrc); - free(hostMemDst); - cudaFree(deviceMem); -} - -void playground(void) { - // Add your experimental CUDA implementation here. -} - -} diff --git a/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cuh b/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cuh deleted file mode 100644 index 54e1ee59ada..00000000000 --- a/plugins/tensorboard-plugins/libkineto/sample_programs/kineto_playground.cuh +++ /dev/null @@ -1,18 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -namespace kineto { - -// Warms up CUDA before the tracing starts -void warmup(void); - -// Basic usage of cudaMemcpy and cudaMemset -void basicMemcpyMemset(void); - -// Your experimental code goes in here! -void playground(void); - -} diff --git a/plugins/tensorboard-plugins/libkineto/src/AbstractConfig.cpp b/plugins/tensorboard-plugins/libkineto/src/AbstractConfig.cpp deleted file mode 100644 index d60ab43c9a3..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/AbstractConfig.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "AbstractConfig.h" - -#include -#include -#include - -#include "Logger.h" - -using namespace std::chrono; - -using std::string; -using std::vector; - -namespace KINETO_NAMESPACE { - -constexpr char kWhitespace[] = "\t\n "; - -static bool isWhitespace(string& s) { - return s.find_first_not_of(kWhitespace) == string::npos; -} - -// Remove whitespace from both end of string -static inline string trim(string& s) { - if (s.empty()) { - return s; - } else if (isWhitespace(s)) { - return ""; - } - auto start = s.find_first_not_of(kWhitespace); - auto end = s.find_last_not_of(kWhitespace); - return s.substr(start, end - start + 1); -} - -// Helper function for split. -// Return the index of char d in string s. -// If not found, returns the length of the string. -static int find(const char* s, char delim) { - int i; - for (i = 0; s[i]; i++) { - if (s[i] == delim) { - break; - } - } - return i; -} - -// Split a string by delimiter -static vector split(const string& s, char delim) { - vector res; - const char* cs = s.c_str(); - for (int i = find(cs, delim); cs[i]; cs += i + 1, i = find(cs, delim)) { - res.emplace_back(cs, i); - } - res.emplace_back(cs); - return res; -} - -// Remove a trailing comment. 
-static inline string stripComment(const string& s) { - std::size_t pos = s.find("#"); - return s.substr(0, pos); -} - -string AbstractConfig::toLower(string& s) const { - string res = s; - for (int i = 0; i < res.size(); i++) { - if (res[i] >= 'A' && res[i] <= 'Z') { - res[i] += ('a' - 'A'); - } - } - return res; -} - -bool AbstractConfig::endsWith(const string& s, const string& suffix) const { - if (suffix.size() > s.size()) { - return false; - } - return s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0; -} - -vector AbstractConfig::splitAndTrim(const string& s, char delim) const { - auto res = split(s, delim); - for (string& x : res) { - x = trim(x); - } - return res; -} - -int64_t AbstractConfig::toIntRange(const string& val, int64_t min, int64_t max) - const { - char* invalid; - int64_t res = strtoll(val.c_str(), &invalid, 10); - if (val.empty() || *invalid) { - throw std::invalid_argument(fmt::format("Invalid integer: {}", val)); - } else if (res < min || res > max) { - throw std::invalid_argument(fmt::format( - "Invalid argument: {} - expected range [{}, {}]", res, min, max)); - } - return res; -} - -int32_t AbstractConfig::toInt32(const string& val) const { - return toIntRange(val, 0, ~0u / 2); -} - -int64_t AbstractConfig::toInt64(const string& val) const { - return toIntRange(val, 0, ~0ul / 2); -} - -bool AbstractConfig::toBool(string& val) const { - const std::array bool_vals{ - "n", "y", "no", "yes", "f", "t", "false", "true"}; - const string lower_val = toLower(val); - for (int i = 0; i < bool_vals.size(); i++) { - if (lower_val == bool_vals[i]) { - return i % 2; - } - } - throw std::invalid_argument(fmt::format("Invalid bool argument: {}", val)); - return false; -} - -bool AbstractConfig::parse(const string& conf) { - std::istringstream iss(conf); - string line; - - timestamp_ = system_clock::now(); - - // Read the string stream 1 line at a time to parse. 
- while (std::getline(iss, line)) { - line = stripComment(line); - if (isWhitespace(line)) { - continue; - } - vector key_val = splitAndTrim(line, '='); - if (key_val.size() != 2) { - LOG(ERROR) << "Invalid config line: " << line; - return false; - } else { - bool handled = false; - try { - handled = handleOption(key_val[0], key_val[1]); - if (!handled) { - for (auto& feature_cfg : featureConfigs_) { - if (feature_cfg.second->handleOption(key_val[0], key_val[1])) { - handled = true; - break; - } - } - } - } catch (const std::exception& e) { - LOG(ERROR) << "Failed to parse config line: " << line; - LOG(ERROR) << e.what(); - return false; - } - if (!handled) { - // This might be due to using a newer config option on an - // older binary where it is not supported. In this case, - // print a warning message - but it is expected to work! - LOG(WARNING) << "Unrecognized config line: " << line; - } - } - } - - validate(timestamp_); - - // Store original text, used to detect updates - source_ = conf; - timestamp_ = system_clock::now(); - return true; -} - -bool AbstractConfig::handleOption( - const std::string& /* unused */, - std::string& /* unused */) { - LOG(ERROR) << "handleOption unimplemented"; - return false; -} - -void AbstractConfig::printActivityProfilerConfig(std::ostream& s) const { - for (const auto& feature_cfg : featureConfigs_) { - feature_cfg.second->printActivityProfilerConfig(s); - } -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/ActivityBuffers.h b/plugins/tensorboard-plugins/libkineto/src/ActivityBuffers.h deleted file mode 100644 index 157af879379..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityBuffers.h +++ /dev/null @@ -1,29 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - - -#include -#include - -#include "libkineto.h" -#include "CuptiActivityBuffer.h" - -namespace KINETO_NAMESPACE { - -struct ActivityBuffers { - std::list> cpu; - std::unique_ptr gpu; - - // Add a wrapper object to the underlying struct stored in the buffer - template - const ITraceActivity& addActivityWrapper(const T& act) { - wrappers_.push_back(std::make_unique(act)); - return *wrappers_.back().get(); - } - - private: - std::vector> wrappers_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/ActivityLoggerFactory.h b/plugins/tensorboard-plugins/libkineto/src/ActivityLoggerFactory.h deleted file mode 100644 index 0d1bf642cd6..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityLoggerFactory.h +++ /dev/null @@ -1,60 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -class ActivityLogger; - -class ActivityLoggerFactory { - - public: - using FactoryFunc = - std::function(const std::string& url)>; - - // Add logger factory for a protocol prefix - void addProtocol(const std::string& protocol, FactoryFunc f) { - factories_[tolower(protocol)] = f; - } - - // Create a logger, invoking the factory for the protocol specified in url - std::unique_ptr makeLogger(const std::string& url) const { - std::string protocol = extractProtocol(url); - auto it = factories_.find(tolower(protocol)); - if (it != factories_.end()) { - return it->second(stripProtocol(url)); - } - throw std::invalid_argument(fmt::format( - "No logger registered for the {} protocol prefix", - protocol)); - return nullptr; - } - - private: - static std::string tolower(std::string s) { - std::transform(s.begin(), s.end(), s.begin(), - [](unsigned char c) { return std::tolower(c); } - ); - return s; - } - - static std::string extractProtocol(std::string url) { - return url.substr(0, 
url.find("://")); - } - - static std::string stripProtocol(std::string url) { - size_t pos = url.find("://"); - return pos == url.npos ? url : url.substr(pos + 3); - } - - std::map factories_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.cpp b/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.cpp deleted file mode 100644 index c85d41ed73f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.cpp +++ /dev/null @@ -1,246 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "ActivityProfilerController.h" - -#include -#include - -#include "ActivityLoggerFactory.h" -#include "ActivityTrace.h" -#include "CuptiActivityApi.h" -#ifdef HAS_ROCTRACER -#include "RoctracerActivityApi.h" -#endif -#include "ThreadUtil.h" -#include "output_json.h" -#include "output_membuf.h" - -#include "Logger.h" - -using namespace std::chrono; - -namespace KINETO_NAMESPACE { - -constexpr milliseconds kProfilerIntervalMsecs(1000); - -ActivityProfilerController::ActivityProfilerController( - ConfigLoader& configLoader, bool cpuOnly) - : configLoader_(configLoader) { -#ifdef HAS_ROCTRACER - profiler_ = std::make_unique( - RoctracerActivityApi::singleton(), cpuOnly); -#else - profiler_ = std::make_unique( - CuptiActivityApi::singleton(), cpuOnly); -#endif - configLoader_.addHandler(ConfigLoader::ConfigKind::ActivityProfiler, this); -} - -ActivityProfilerController::~ActivityProfilerController() { - configLoader_.removeHandler( - ConfigLoader::ConfigKind::ActivityProfiler, this); - if (profilerThread_) { - // signaling termination of the profiler loop - stopRunloop_ = true; - profilerThread_->join(); - delete profilerThread_; - profilerThread_ = nullptr; - } -} - -static ActivityLoggerFactory initLoggerFactory() { - ActivityLoggerFactory factory; - factory.addProtocol("file", [](const std::string& url) { - return std::unique_ptr(new 
ChromeTraceLogger(url)); - }); - return factory; -} - -static ActivityLoggerFactory& loggerFactory() { - static ActivityLoggerFactory factory = initLoggerFactory(); - return factory; -} - -void ActivityProfilerController::addLoggerFactory( - const std::string& protocol, ActivityLoggerFactory::FactoryFunc factory) { - loggerFactory().addProtocol(protocol, factory); -} - -static std::unique_ptr makeLogger(const Config& config) { - if (config.activitiesLogToMemory()) { - return std::make_unique(config); - } - return loggerFactory().makeLogger(config.activitiesLogUrl()); -} - -bool ActivityProfilerController::canAcceptConfig() { - return !profiler_->isActive(); -} - -void ActivityProfilerController::acceptConfig(const Config& config) { - VLOG(1) << "acceptConfig"; - if (config.activityProfilerEnabled()) { - scheduleTrace(config); - } -} - -void ActivityProfilerController::profilerLoop() { - setThreadName("Kineto Activity Profiler"); - VLOG(0) << "Entering activity profiler loop"; - - auto now = system_clock::now(); - auto next_wakeup_time = now + kProfilerIntervalMsecs; - - while (!stopRunloop_) { - now = system_clock::now(); - - while (now < next_wakeup_time) { - /* sleep override */ - std::this_thread::sleep_for(next_wakeup_time - now); - now = system_clock::now(); - } - - if (!profiler_->isActive()) { - std::lock_guard lock(asyncConfigLock_); - if (asyncRequestConfig_ - && !asyncRequestConfig_->hasProfileStartIteration()) { - // Note on now + kProfilerIntervalMsecs - // Profiler interval does not align perfectly upto startTime - warmup. Waiting until the next tick - // won't allow sufficient time for the profiler to warm up. 
So check if we are very close to the warmup time and trigger warmup - if (now + kProfilerIntervalMsecs - >= (asyncRequestConfig_->requestTimestamp() - asyncRequestConfig_->activitiesWarmupDuration())) { - LOG(INFO) << "Received on-demand activity trace request by " - << " profile timestamp = " - << asyncRequestConfig_-> - requestTimestamp().time_since_epoch().count(); - activateConfig(now); - } - } - } - - while (next_wakeup_time < now) { - next_wakeup_time += kProfilerIntervalMsecs; - } - - if (profiler_->isActive()) { - next_wakeup_time = profiler_->performRunLoopStep(now, next_wakeup_time); - VLOG(1) << "Profiler loop: " - << duration_cast(system_clock::now() - now).count() - << "ms"; - } - } - - VLOG(0) << "Exited activity profiling loop"; -} - -void ActivityProfilerController::step() { - int64_t currentIter = ++iterationCount_; - VLOG(0) << "Step called , iteration = " << currentIter; - - // optimization to not take the lock unless necessary - if (asyncRequestConfig_ && !profiler_->isActive()) { - std::lock_guard lock(asyncConfigLock_); - auto startIter = asyncRequestConfig_->startIterationIncludingWarmup(); - - if (asyncRequestConfig_->hasProfileStartIteration() - && currentIter >= startIter) { - LOG(INFO) << "Received on-demand activity trace request by profile" - << " start iteration = " - << asyncRequestConfig_->profileStartIteration() - << " current iteration = " << currentIter; - - if (currentIter > startIter) { - // adjust the start iteration if it is in the past - auto newProfileStart = currentIter + - asyncRequestConfig_->activitiesWarmupIterations(); - LOG(INFO) << "Start iteration updated to " << newProfileStart; - asyncRequestConfig_->setProfileStartIteration(newProfileStart); - } - activateConfig(system_clock::now()); - } - } - - if (profiler_->isActive()) { - auto now = system_clock::now(); - auto next_wakeup_time = now + kProfilerIntervalMsecs; - profiler_->performRunLoopStep(now, next_wakeup_time, currentIter); - } -} - -void 
ActivityProfilerController::activateConfig( - std::chrono::time_point now) { - logger_ = makeLogger(*asyncRequestConfig_); - profiler_->setLogger(logger_.get()); - profiler_->configure(*asyncRequestConfig_, now); - asyncRequestConfig_ = nullptr; -} - -void ActivityProfilerController::scheduleTrace(const Config& config) { - VLOG(1) << "scheduleTrace"; - if (profiler_->isActive()) { - LOG(ERROR) << "Ignored request - profiler busy"; - return; - } - int64_t currentIter = iterationCount_; - if (config.hasProfileStartIteration() && currentIter < 0) { - LOG(ERROR) << "Ignored profile iteration count based request as " - << "application is not updating iteration count"; - return; - } - std::lock_guard lock(asyncConfigLock_); - asyncRequestConfig_ = config.clone(); - - auto startIter = asyncRequestConfig_->startIterationIncludingWarmup(); - - if (asyncRequestConfig_->hasProfileStartIteration() - && (currentIter > startIter) - && asyncRequestConfig_->profileStartIterationRoundUp() > 0) { - auto newProfileStart - = currentIter + asyncRequestConfig_->activitiesWarmupIterations(); - // round up to nearest multiple - auto divisor = asyncRequestConfig_->profileStartIterationRoundUp(); - auto rem = newProfileStart % divisor; - newProfileStart += ((rem == 0) ? 0 : divisor - rem); - LOG(INFO) << "Rounding up profiler start iteration to : " << newProfileStart; - asyncRequestConfig_->setProfileStartIteration(newProfileStart); - } - - // start a profilerLoop() thread to handle request - if (!profilerThread_) { - profilerThread_ = - new std::thread(&ActivityProfilerController::profilerLoop, this); - } -} - -void ActivityProfilerController::prepareTrace(const Config& config) { - // Requests from ActivityProfilerApi have higher priority than - // requests from other sources (signal, daemon). - // Cancel any ongoing request and refuse new ones. 
- auto now = system_clock::now(); - if (profiler_->isActive()) { - LOG(WARNING) << "Cancelling current trace request in order to start " - << "higher priority synchronous request"; - if (libkineto::api().client()) { - libkineto::api().client()->stop(); - } - profiler_->stopTrace(now); - profiler_->reset(); - } - - profiler_->configure(config, now); -} - -std::unique_ptr ActivityProfilerController::stopTrace() { - profiler_->stopTrace(std::chrono::system_clock::now()); - auto logger = std::make_unique(profiler_->config()); - profiler_->processTrace(*logger); - profiler_->reset(); - return std::make_unique(std::move(logger), loggerFactory()); -} - -void ActivityProfilerController::addMetadata( - const std::string& key, const std::string& value) { - profiler_->addMetadata(key, value); -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.h b/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.h deleted file mode 100644 index 415f107cbed..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerController.h +++ /dev/null @@ -1,84 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include -#include - -#include "ActivityLoggerFactory.h" -#include "CuptiActivityProfiler.h" -#include "ActivityProfilerInterface.h" -#include "ActivityTraceInterface.h" -#include "ConfigLoader.h" -#include "CuptiActivityApi.h" - -namespace KINETO_NAMESPACE { - -class Config; - -class ActivityProfilerController : public ConfigLoader::ConfigHandler { - public: - explicit ActivityProfilerController(ConfigLoader& configLoader, bool cpuOnly); - ActivityProfilerController(const ActivityProfilerController&) = delete; - ActivityProfilerController& operator=(const ActivityProfilerController&) = - delete; - - ~ActivityProfilerController(); - - static void addLoggerFactory( - const std::string& protocol, - ActivityLoggerFactory::FactoryFunc factory); - - bool canAcceptConfig() override; - void acceptConfig(const Config& config) override; - - void scheduleTrace(const Config& config); - - void prepareTrace(const Config& config); - - void startTrace() { - profiler_->startTrace(std::chrono::system_clock::now()); - } - - void step(); - - std::unique_ptr stopTrace(); - - bool isActive() { - return profiler_->isActive(); - } - - void transferCpuTrace( - std::unique_ptr cpuTrace) { - return profiler_->transferCpuTrace(std::move(cpuTrace)); - } - - void recordThreadInfo() { - profiler_->recordThreadInfo(); - } - - void addChildActivityProfiler( - std::unique_ptr profiler) { - profiler_->addChildActivityProfiler(std::move(profiler)); - } - - void addMetadata(const std::string& key, const std::string& value); - - private: - void profilerLoop(); - void activateConfig(std::chrono::time_point now); - - std::unique_ptr asyncRequestConfig_; - std::mutex asyncConfigLock_; - std::unique_ptr profiler_; - std::unique_ptr logger_; - std::thread* profilerThread_{nullptr}; - std::atomic_bool stopRunloop_{false}; - std::atomic iterationCount_{-1}; - ConfigLoader& configLoader_; -}; - -} // namespace KINETO_NAMESPACE diff --git 
a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.cpp b/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.cpp deleted file mode 100644 index b2d36b7b3ab..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "ActivityProfilerProxy.h" - -#include "ActivityProfilerController.h" -#include "Config.h" -#include "CuptiActivityApi.h" -#include "Logger.h" -#include - -namespace KINETO_NAMESPACE { - -ActivityProfilerProxy::ActivityProfilerProxy( - bool cpuOnly, ConfigLoader& configLoader) - : cpuOnly_(cpuOnly), configLoader_(configLoader) { -} - -ActivityProfilerProxy::~ActivityProfilerProxy() { - delete controller_; -}; - -void ActivityProfilerProxy::init() { - if (!controller_) { - controller_ = new ActivityProfilerController(configLoader_, cpuOnly_); - } -} - -void ActivityProfilerProxy::scheduleTrace(const std::string& configStr) { - Config config; - config.parse(configStr); - controller_->scheduleTrace(config); -} - -void ActivityProfilerProxy::scheduleTrace(const Config& config) { - controller_->scheduleTrace(config); -} - -void ActivityProfilerProxy::prepareTrace( - const std::set& activityTypes, - const std::string& configStr) { - Config config; - bool validate_required = true; - - // allow user provided config to override default options - if (!configStr.empty()) { - if (!config.parse(configStr)) { - LOG(WARNING) << "Failed to parse config : " << configStr; - } - // parse also runs validate - validate_required = false; - } - - config.setClientDefaults(); - config.setSelectedActivityTypes(activityTypes); - - if (validate_required) { - config.validate(std::chrono::system_clock::now()); - } - - controller_->prepareTrace(config); -} - -void ActivityProfilerProxy::startTrace() { - controller_->startTrace(); -} - -std::unique_ptr -ActivityProfilerProxy::stopTrace() { - return 
controller_->stopTrace(); -} - -void ActivityProfilerProxy::step() { - controller_->step(); -} - -bool ActivityProfilerProxy::isActive() { - return controller_->isActive(); -} - -void ActivityProfilerProxy::pushCorrelationId(uint64_t id) { - CuptiActivityApi::pushCorrelationID(id, - CuptiActivityApi::CorrelationFlowType::Default); -} - -void ActivityProfilerProxy::popCorrelationId() { - CuptiActivityApi::popCorrelationID( - CuptiActivityApi::CorrelationFlowType::Default); -} - -void ActivityProfilerProxy::pushUserCorrelationId(uint64_t id) { - CuptiActivityApi::pushCorrelationID(id, - CuptiActivityApi::CorrelationFlowType::User); -} - -void ActivityProfilerProxy::popUserCorrelationId() { - CuptiActivityApi::popCorrelationID( - CuptiActivityApi::CorrelationFlowType::User); -} - -void ActivityProfilerProxy::transferCpuTrace( - std::unique_ptr traceBuffer) { - controller_->transferCpuTrace(std::move(traceBuffer)); -} - -void ActivityProfilerProxy::addMetadata( - const std::string& key, const std::string& value) { - controller_->addMetadata(key, value); -} - -void ActivityProfilerProxy::recordThreadInfo() { - controller_->recordThreadInfo(); -} - -void ActivityProfilerProxy::addChildActivityProfiler( - std::unique_ptr profiler) { - controller_->addChildActivityProfiler(std::move(profiler)); -} - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.h b/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.h deleted file mode 100644 index b5cf84b2f1d..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityProfilerProxy.h +++ /dev/null @@ -1,73 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include "ActivityProfilerInterface.h" - -#include -#include -#include - -#include "ActivityType.h" -#include "ITraceActivity.h" - -namespace libkineto { - // previous declaration is struct so this one must be too. 
- struct CpuTraceBuffer; -} - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -class ActivityProfilerController; -class Config; -class ConfigLoader; - -class ActivityProfilerProxy : public ActivityProfilerInterface { - - public: - ActivityProfilerProxy(bool cpuOnly, ConfigLoader& configLoader); - ~ActivityProfilerProxy() override; - - void init() override; - bool isInitialized() override { - return controller_ != nullptr; - } - - bool isActive() override; - - void recordThreadInfo() override; - - void scheduleTrace(const std::string& configStr) override; - void scheduleTrace(const Config& config); - - void prepareTrace( - const std::set& activityTypes, - const std::string& configStr = "") override; - - void startTrace() override; - void step() override; - std::unique_ptr stopTrace() override; - - void pushCorrelationId(uint64_t id) override; - void popCorrelationId() override; - - void pushUserCorrelationId(uint64_t id) override; - void popUserCorrelationId() override; - - void transferCpuTrace( - std::unique_ptr traceBuffer) override; - - void addMetadata(const std::string& key, const std::string& value) override; - - virtual void addChildActivityProfiler( - std::unique_ptr profiler) override; - - private: - bool cpuOnly_{true}; - ConfigLoader& configLoader_; - ActivityProfilerController* controller_{nullptr}; -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/ActivityTrace.h b/plugins/tensorboard-plugins/libkineto/src/ActivityTrace.h deleted file mode 100644 index 0be76af08e4..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityTrace.h +++ /dev/null @@ -1,45 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include - -#include "ActivityLoggerFactory.h" -#include "ActivityTraceInterface.h" -#include "output_json.h" -#include "output_membuf.h" - -namespace libkineto { - -class ActivityTrace : public ActivityTraceInterface { - public: - ActivityTrace( - std::unique_ptr tmpLogger, - const ActivityLoggerFactory& factory) - : memLogger_(std::move(tmpLogger)), - loggerFactory_(factory) { - } - - const std::vector* activities() override { - return memLogger_->traceActivities(); - }; - - void save(const std::string& url) override { - std::string prefix; - // if no protocol is specified, default to file - if (url.find("://") == url.npos) { - prefix = "file://"; - } - memLogger_->log(*loggerFactory_.makeLogger(prefix + url)); - }; - - private: - // Activities are logged into a buffer - std::unique_ptr memLogger_; - - // Alternative logger used by save() if protocol prefix is specified - const ActivityLoggerFactory& loggerFactory_; -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/ActivityType.cpp b/plugins/tensorboard-plugins/libkineto/src/ActivityType.cpp deleted file mode 100644 index 18856b72370..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ActivityType.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "ActivityType.h" - -#include - -namespace libkineto { - -struct ActivityTypeName { - const char* name; - ActivityType type; -}; - -static constexpr std::array map{{ - {"cpu_op", ActivityType::CPU_OP}, - {"user_annotation", ActivityType::USER_ANNOTATION}, - {"gpu_user_Annotation", ActivityType::GPU_USER_ANNOTATION}, - {"gpu_memcpy", ActivityType::GPU_MEMCPY}, - {"gpu_memset", ActivityType::GPU_MEMSET}, - {"kernel", ActivityType::CONCURRENT_KERNEL}, - {"external_correlation", ActivityType::EXTERNAL_CORRELATION}, - {"cuda_runtime", ActivityType::CUDA_RUNTIME}, - {"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE}, - {"glow_runtime", ActivityType::GLOW_RUNTIME}, - {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT}, - {"python_function", ActivityType::PYTHON_FUNCTION}, - {"overhead", ActivityType::OVERHEAD}, - {"ENUM_COUNT", ActivityType::ENUM_COUNT} -}}; - -static constexpr bool matchingOrder(int idx = 0) { - return map[idx].type == ActivityType::ENUM_COUNT || - ((idx == (int) map[idx].type) && matchingOrder(idx + 1)); -} -static_assert(matchingOrder(), "ActivityTypeName map is out of order"); - -const char* toString(ActivityType t) { - return map[(int)t].name; -} - -ActivityType toActivityType(const std::string& str) { - for (int i = 0; i < activityTypeCount; i++) { - if (str == map[i].name) { - return map[i].type; - } - } - throw std::invalid_argument(fmt::format("Invalid activity type: {}", str)); -} - -const std::array activityTypes() { - std::array res; - for (int i = 0; i < activityTypeCount; i++) { - res[i] = map[i].type; - } - return res; -} - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/Config.cpp b/plugins/tensorboard-plugins/libkineto/src/Config.cpp deleted file mode 100644 index 95538840f37..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/Config.cpp +++ /dev/null @@ -1,473 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "Config.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Logger.h" -#include "ThreadUtil.h" - -using namespace std::chrono; - -using std::string; -using std::vector; - -namespace KINETO_NAMESPACE { - -constexpr milliseconds kDefaultSamplePeriodMsecs(1000); -constexpr milliseconds kDefaultMultiplexPeriodMsecs(1000); -constexpr milliseconds kDefaultActivitiesProfileDurationMSecs(500); -constexpr int kDefaultActivitiesMaxGpuBufferSize(128 * 1024 * 1024); -constexpr seconds kDefaultActivitiesWarmupDurationSecs(5); -constexpr seconds kDefaultBufferUntilWarmup(10); -constexpr seconds kDefaultReportPeriodSecs(1); -constexpr int kDefaultSamplesPerReport(1); -constexpr int kDefaultMaxEventProfilersPerGpu(1); -constexpr int kDefaultEventProfilerHearbeatMonitorPeriod(0); -constexpr seconds kMaxRequestAge(10); - -// Event Profiler -constexpr char kEventsKey[] = "EVENTS"; -constexpr char kMetricsKey[] = "METRICS"; -constexpr char kSamplePeriodKey[] = "SAMPLE_PERIOD_MSECS"; -constexpr char kMultiplexPeriodKey[] = "MULTIPLEX_PERIOD_MSECS"; -constexpr char kReportPeriodKey[] = "REPORT_PERIOD_SECS"; -constexpr char kSamplesPerReportKey[] = "SAMPLES_PER_REPORT"; -constexpr char kEventsLogFileKey[] = "EVENTS_LOG_FILE"; -constexpr char kEventsEnabledDevicesKey[] = "EVENTS_ENABLED_DEVICES"; -constexpr char kOnDemandDurationKey[] = "EVENTS_DURATION_SECS"; -constexpr char kMaxEventProfilersPerGpuKey[] = "MAX_EVENT_PROFILERS_PER_GPU"; -constexpr char kHeartbeatMonitorPeriodKey[] = - "EVENTS_HEARTBEAT_MONITOR_PERIOD_SECS"; - -// Activity Profiler -constexpr char kActivitiesEnabledKey[] = "ACTIVITIES_ENABLED"; -constexpr char kActivityTypesKey[] = "ACTIVITY_TYPES"; -constexpr char kActivitiesLogFileKey[] = "ACTIVITIES_LOG_FILE"; -constexpr char kActivitiesDurationKey[] = "ACTIVITIES_DURATION_SECS"; -constexpr char kActivitiesDurationMsecsKey[] = "ACTIVITIES_DURATION_MSECS"; -constexpr char 
kActivitiesWarmupDurationSecsKey[] = "ACTIVITIES_WARMUP_PERIOD_SECS"; -constexpr char kActivitiesMaxGpuBufferSizeKey[] = - "ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB"; - -// Client Interface -constexpr char kClientInterfaceEnableOpInputsCollection[] = "CLIENT_INTERFACE_ENABLE_OP_INPUTS_COLLECTION"; - -constexpr char kActivitiesWarmupIterationsKey[] = "ACTIVITIES_WARMUP_ITERATIONS"; -constexpr char kActivitiesIterationsKey[] = "ACTIVITIES_ITERATIONS"; -// Common - -// Client-side timestamp used for synchronized start across hosts for -// distributed workloads. -// Specified in milliseconds Unix time (milliseconds since epoch). -// To use, compute a future timestamp as follows: -// * C++: + duration_cast( -// system_clock::now().time_since_epoch()).count() -// * Python: + int(time.time() * 1000) -// * Bash: $(( + $(date +%s%3N))) -// If used for a tracing request, timestamp must be far enough in the future -// to accommodate ACTIVITIES_WARMUP_PERIOD_SECS as well as any delays in -// propagating the request to the profiler. -// If the request can not be honored, it is up to the profilers to report -// an error somehow - no checks are done at config parse time. -// Note PROFILE_START_ITERATION has higher precedence -constexpr char kProfileStartTimeKey[] = "PROFILE_START_TIME"; -// DEPRECATED - USE PROFILE_START_TIME instead -constexpr char kRequestTimestampKey[] = "REQUEST_TIMESTAMP"; - -// Alternatively if the application supports reporting iterations -// start the profile at specific iteration. If the iteration count -// is >= this value the profile is started immediately. -// A value >= 0 is valid for this config option to take effect. -// Note PROFILE_START_ITERATION will take precedence over PROFILE_START_TIME. -constexpr char kProfileStartIterationKey[] = "PROFILE_START_ITERATION"; - -// Users can also start the profile on an integer multiple of the config -// value PROFILE_START_ITERATION_ROUNDUP. 
This knob behaves similar to -// PROFILE_START_ITERATION but instead of saying : "start collection trace on -// iteration 500", one can configure it to "start collecting trace on the next -// 100th iteration". -// -// For example, -// PROFILE_START_ITERATION_ROUNDUP = 1000, and the current iteration is 2010 -// The profile will then be collected on the next multiple of 1000 ie. 3000 -// Note PROFILE_START_ITERATION_ROUNDUP will also take precedence over -// PROFILE_START_TIME. -constexpr char kProfileStartIterationRoundUpKey[] - = "PROFILE_START_ITERATION_ROUNDUP"; - -// Enable on-demand trigger via kill -USR2 -// When triggered in this way, /tmp/libkineto.conf will be used as config. -constexpr char kEnableSigUsr2Key[] = "ENABLE_SIGUSR2"; - -// Enable communication through IPC Fabric -// and disable thrift communication with dynolog daemon -constexpr char kEnableIpcFabricKey[] = "ENABLE_IPC_FABRIC"; - -// Verbose log level -// The actual glog is not used and --v and --vmodule has no effect. -// Instead set the verbose level and modules in the config file. -constexpr char kLogVerboseLevelKey[] = "VERBOSE_LOG_LEVEL"; -// By default, all modules will log verbose messages >= verboseLogLevel. -// But to reduce noise we can specify one or more modules of interest. -// A module is a C/C++ object file (source file name), -// Example argument: ActivityProfiler.cpp,output_json.cpp -constexpr char kLogVerboseModulesKey[] = "VERBOSE_LOG_MODULES"; - -// Max devices supported on any system -constexpr uint8_t kMaxDevices = 8; - -namespace { - -struct FactoryMap { - - void addFactory( - std::string name, - std::function factory) { - std::lock_guard lock(lock_); - factories_[name] = factory; - } - - void addFeatureConfigs(Config& cfg) { - std::lock_guard lock(lock_); - for (const auto& p : factories_) { - cfg.addFeature(p.first, p.second(cfg)); - } - } - -// Config factories are shared between objects and since -// config objects can be created by multiple threads, we need a lock. 
- std::mutex lock_; - std::map> factories_; -}; - -std::shared_ptr configFactories() { - // Ensure this is safe to call during shutdown, even as static - // destructors are invoked. Once factories destructor has been - // invoked, weak_ptr.lock() will return nullptr. - // But calls before that point will have a valid shared_ptr, - // delaying destruction of the underlying FactoryMap. - static auto factories = std::make_shared(); - static std::weak_ptr weak_ptr = factories; - return weak_ptr.lock(); -} - -} // namespace - -void Config::addConfigFactory( - std::string name, - std::function factory) { - auto factories = configFactories(); - if (factories) { - factories->addFactory(name, factory); - } -} - -static string defaultTraceFileName() { - return fmt::format("/tmp/libkineto_activities_{}.json", processId()); -} - -Config::Config() - : verboseLogLevel_(-1), - samplePeriod_(kDefaultSamplePeriodMsecs), - reportPeriod_(duration_cast(kDefaultReportPeriodSecs)), - samplesPerReport_(kDefaultSamplesPerReport), - eventProfilerOnDemandDuration_(seconds(0)), - eventProfilerMaxInstancesPerGpu_(kDefaultMaxEventProfilersPerGpu), - eventProfilerHeartbeatMonitorPeriod_( - kDefaultEventProfilerHearbeatMonitorPeriod), - multiplexPeriod_(kDefaultMultiplexPeriodMsecs), - activityProfilerEnabled_(true), - activitiesLogFile_(defaultTraceFileName()), - activitiesLogUrl_(fmt::format("file://{}", activitiesLogFile_)), - activitiesMaxGpuBufferSize_(kDefaultActivitiesMaxGpuBufferSize), - activitiesWarmupDuration_(kDefaultActivitiesWarmupDurationSecs), - activitiesWarmupIterations_(0), - activitiesDuration_(kDefaultActivitiesProfileDurationMSecs), - activitiesRunIterations_(0), - activitiesOnDemandTimestamp_(milliseconds(0)), - profileStartTime_(milliseconds(0)), - profileStartIteration_(-1), - profileStartIterationRoundUp_(-1), - requestTimestamp_(milliseconds(0)), - enableSigUsr2_(false), - enableIpcFabric_(false) { - auto factories = configFactories(); - if (factories) { - 
factories->addFeatureConfigs(*this); - } -} - -uint8_t Config::createDeviceMask(const string& val) { - uint8_t res = 0; - for (const auto& d : splitAndTrim(val, ',')) { - res |= 1 << toIntRange(d, 0, kMaxDevices - 1); - } - return res; -} - -const seconds Config::maxRequestAge() const { - return kMaxRequestAge; -} - -static std::string getTimeStr(time_point t) { - std::time_t t_c = system_clock::to_time_t(t); - return fmt::format("{:%H:%M:%S}", fmt::localtime(t_c)); -} - -static time_point handleRequestTimestamp(int64_t ms) { - auto t = time_point(milliseconds(ms)); - auto now = system_clock::now(); - if (t > now) { - throw std::invalid_argument(fmt::format( - "Invalid {}: {} - time is in future", - kRequestTimestampKey, - getTimeStr(t))); - } else if ((now - t) > kMaxRequestAge) { - throw std::invalid_argument(fmt::format( - "Invalid {}: {} - time is more than {}s in the past", - kRequestTimestampKey, - getTimeStr(t), - kMaxRequestAge.count())); - } - return t; -} - -void Config::setActivityTypes( - const std::vector& selected_activities) { - selectedActivityTypes_.clear(); - if (selected_activities.size() > 0) { - for (const auto& activity : selected_activities) { - if (activity == "") { - continue; - } - selectedActivityTypes_.insert(toActivityType(activity)); - } - } -} - -bool Config::handleOption(const std::string& name, std::string& val) { - // Event Profiler - if (!name.compare(kEventsKey)) { - vector event_names = splitAndTrim(val, ','); - eventNames_.insert(event_names.begin(), event_names.end()); - } else if (!name.compare(kMetricsKey)) { - vector metric_names = splitAndTrim(val, ','); - metricNames_.insert(metric_names.begin(), metric_names.end()); - } else if (!name.compare(kSamplePeriodKey)) { - samplePeriod_ = milliseconds(toInt32(val)); - } else if (!name.compare(kMultiplexPeriodKey)) { - multiplexPeriod_ = milliseconds(toInt32(val)); - } else if (!name.compare(kReportPeriodKey)) { - setReportPeriod(seconds(toInt32(val))); - } else if 
(!name.compare(kSamplesPerReportKey)) { - samplesPerReport_ = toInt32(val); - } else if (!name.compare(kEventsLogFileKey)) { - eventLogFile_ = val; - } else if (!name.compare(kEventsEnabledDevicesKey)) { - eventProfilerDeviceMask_ = createDeviceMask(val); - } else if (!name.compare(kOnDemandDurationKey)) { - eventProfilerOnDemandDuration_ = seconds(toInt32(val)); - eventProfilerOnDemandTimestamp_ = timestamp(); - } else if (!name.compare(kMaxEventProfilersPerGpuKey)) { - eventProfilerMaxInstancesPerGpu_ = toInt32(val); - } else if (!name.compare(kHeartbeatMonitorPeriodKey)) { - eventProfilerHeartbeatMonitorPeriod_ = seconds(toInt32(val)); - } - - // Activity Profiler - else if (!name.compare(kActivitiesDurationKey)) { - activitiesDuration_ = - duration_cast(seconds(toInt32(val))); - activitiesOnDemandTimestamp_ = timestamp(); - } else if (!name.compare(kActivityTypesKey)) { - vector activity_types = splitAndTrim(toLower(val), ','); - setActivityTypes(activity_types); - } else if (!name.compare(kActivitiesDurationMsecsKey)) { - activitiesDuration_ = milliseconds(toInt32(val)); - activitiesOnDemandTimestamp_ = timestamp(); - } else if (!name.compare(kActivitiesIterationsKey)) { - activitiesRunIterations_ = toInt32(val); - activitiesOnDemandTimestamp_ = timestamp(); - } else if (!name.compare(kLogVerboseLevelKey)) { - verboseLogLevel_ = toInt32(val); - } else if (!name.compare(kLogVerboseModulesKey)) { - verboseLogModules_ = splitAndTrim(val, ','); - } else if (!name.compare(kActivitiesEnabledKey)) { - activityProfilerEnabled_ = toBool(val); - } else if (!name.compare(kActivitiesLogFileKey)) { - activitiesLogFile_ = val; - activitiesLogUrl_ = fmt::format("file://{}", val); - activitiesOnDemandTimestamp_ = timestamp(); - } else if (!name.compare(kActivitiesMaxGpuBufferSizeKey)) { - activitiesMaxGpuBufferSize_ = toInt32(val) * 1024 * 1024; - } else if (!name.compare(kActivitiesWarmupDurationSecsKey)) { - activitiesWarmupDuration_ = seconds(toInt32(val)); - } else if 
(!name.compare(kActivitiesWarmupIterationsKey)) { - activitiesWarmupIterations_ = toInt32(val); - } - - // Client Interface - else if (!name.compare(kClientInterfaceEnableOpInputsCollection)) { - enableOpInputsCollection_ = toBool(val); - } - - // Common - else if (!name.compare(kRequestTimestampKey)) { - VLOG(0) << kRequestTimestampKey - << " has been deprecated - please use " - << kProfileStartTimeKey; - requestTimestamp_ = handleRequestTimestamp(toInt64(val)); - } else if (!name.compare(kProfileStartTimeKey)) { - profileStartTime_ = - time_point(milliseconds(toInt64(val))); - } else if (!name.compare(kProfileStartIterationKey)) { - profileStartIteration_ = toInt32(val); - } else if (!name.compare(kProfileStartIterationRoundUpKey)) { - profileStartIterationRoundUp_ = toInt32(val); - } else if (!name.compare(kEnableSigUsr2Key)) { - enableSigUsr2_ = toBool(val); - } else if (!name.compare(kEnableIpcFabricKey)) { - enableIpcFabric_ = toBool(val); - } else { - return false; - } - return true; -} - -std::chrono::milliseconds Config::activitiesDurationDefault() const { - return kDefaultActivitiesProfileDurationMSecs; -}; - -void Config::updateActivityProfilerRequestReceivedTime() { - activitiesOnDemandTimestamp_ = system_clock::now(); -} - -void Config::setClientDefaults() { - AbstractConfig::setClientDefaults(); - activitiesLogToMemory_ = true; -} - -void Config::validate( - const time_point& fallbackProfileStartTime) { - if (samplePeriod_.count() == 0) { - LOG(WARNING) << "Sample period must be greater than 0, setting to 1ms"; - samplePeriod_ = milliseconds(1); - } - - if (multiplexPeriod_ < samplePeriod_) { - LOG(WARNING) << "Multiplex period can not be smaller " - << "than sample period"; - LOG(WARNING) << "Setting multiplex period to " << samplePeriod_.count() - << "ms"; - multiplexPeriod_ = samplePeriod_; - } - - if ((multiplexPeriod_ % samplePeriod_).count() != 0) { - LOG(WARNING) << "Multiplex period must be a " - << "multiple of sample period"; - 
multiplexPeriod_ = alignUp(multiplexPeriod_, samplePeriod_); - LOG(WARNING) << "Setting multiplex period to " << multiplexPeriod_.count() - << "ms"; - } - - if ((reportPeriod_ % multiplexPeriod_).count() != 0 || - reportPeriod_.count() == 0) { - LOG(WARNING) << "Report period must be a " - << "multiple of multiplex period"; - reportPeriod_ = alignUp(reportPeriod_, multiplexPeriod_); - LOG(WARNING) << "Setting report period to " << reportPeriod_.count() - << "ms"; - } - - if (samplesPerReport_ < 1) { - LOG(WARNING) << "Samples per report must be in the range " - << "[1, report period / sample period]"; - LOG(WARNING) << "Setting samples per report to 1"; - samplesPerReport_ = 1; - } - - int max_samples_per_report = reportPeriod_ / samplePeriod_; - if (samplesPerReport_ > max_samples_per_report) { - LOG(WARNING) << "Samples per report must be in the range " - << "[1, report period / sample period] ([1, " - << reportPeriod_.count() << "ms / " << samplePeriod_.count() - << "ms = " << max_samples_per_report << "])"; - LOG(WARNING) << "Setting samples per report to " << max_samples_per_report; - samplesPerReport_ = max_samples_per_report; - } - - if (!hasProfileStartTime()) { - VLOG(0) - << "No explicit timestamp has been set. 
" - << "Defaulting it to now + activitiesWarmupDuration with buffer."; - profileStartTime_ = fallbackProfileStartTime + - activitiesWarmupDuration() + kDefaultBufferUntilWarmup; - } - - if (profileStartIterationRoundUp_ == 0) { - // setting to 0 will mess up modulo arithmetic, set it to -1 so it has no effect - LOG(WARNING) << "Profiler start iteration round up should be >= 1."; - profileStartIterationRoundUp_ = -1; - } - - if (profileStartIterationRoundUp_ > 0 && !hasProfileStartIteration()) { - VLOG(0) << "Setting profiler start iteration to 0 so this config is " - << "triggered via iteration count."; - profileStartIteration_ = 0; - } - - if (selectedActivityTypes_.size() == 0) { - selectDefaultActivityTypes(); - } -} - -void Config::setReportPeriod(milliseconds msecs) { - reportPeriod_ = msecs; -} - -void Config::printActivityProfilerConfig(std::ostream& s) const { - s << "Log file: " << activitiesLogFile() << std::endl; - if (hasProfileStartIteration()) { - s << "Trace start Iteration: " << profileStartIteration() << std::endl; - s << "Trace warmup Iterations: " << activitiesWarmupIterations() << std::endl; - s << "Trace profile Iterations: " << activitiesRunIterations() << std::endl; - if (profileStartIterationRoundUp() > 0) { - s << "Trace start iteration roundup : " << profileStartIterationRoundUp() - << std::endl; - } - } else if (hasProfileStartTime()) { - std::time_t t_c = system_clock::to_time_t(requestTimestamp()); - LOG(INFO) << "Trace start time: " - << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(t_c)); - s << "Trace duration: " << activitiesDuration().count() << "ms" - << std::endl; - s << "Warmup duration: " << activitiesWarmupDuration().count() << "s" - << std::endl; - } - - s << "Max GPU buffer size: " << activitiesMaxGpuBufferSize() / 1024 / 1024 - << "MB" << std::endl; - - std::vector activities; - for (const auto& activity : selectedActivityTypes_) { - activities.push_back(toString(activity)); - } - s << "Enabled activities: " - << 
fmt::format("{}", fmt::join(activities, ",")) << std::endl; - - AbstractConfig::printActivityProfilerConfig(s); -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/ConfigLoader.cpp b/plugins/tensorboard-plugins/libkineto/src/ConfigLoader.cpp deleted file mode 100644 index 4080b678d37..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ConfigLoader.cpp +++ /dev/null @@ -1,300 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "ConfigLoader.h" - -#ifdef __linux__ -#include -#endif - -#include -#include -#include -#include -#include - -#include "DaemonConfigLoader.h" - -#include "Logger.h" - -using namespace std::chrono; -using std::string; - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -constexpr char kConfigFileEnvVar[] = "KINETO_CONFIG"; -#ifdef __linux__ -constexpr char kConfigFile[] = "/etc/libkineto.conf"; -constexpr char kOnDemandConfigFile[] = "/tmp/libkineto.conf"; -#else -constexpr char kConfigFile[] = "libkineto.conf"; -constexpr char kOnDemandConfigFile[] = "libkineto.conf"; -#endif - -constexpr std::chrono::seconds kConfigUpdateIntervalSecs(300); -constexpr std::chrono::seconds kOnDemandConfigUpdateIntervalSecs(5); - -#ifdef __linux__ -static struct sigaction originalUsr2Handler = {}; -#endif - -// Use SIGUSR2 to initiate profiling. -// Look for an on-demand config file. -// If none is found, default to base config. 
-// Try to not affect existing handlers -static bool hasOriginalSignalHandler() { -#ifdef __linux__ - return originalUsr2Handler.sa_handler != nullptr || - originalUsr2Handler.sa_sigaction != nullptr; -#else - return false; -#endif -} - -static void handle_signal(int signal) { -#ifdef __linux__ - if (signal == SIGUSR2) { - ConfigLoader::instance().handleOnDemandSignal(); - if (hasOriginalSignalHandler()) { - // Invoke original handler and reinstate ours - struct sigaction act; - sigaction(SIGUSR2, &originalUsr2Handler, &act); - raise(SIGUSR2); - sigaction(SIGUSR2, &act, &originalUsr2Handler); - } - } -#endif -} - -static void setupSignalHandler(bool enableSigUsr2) { -#ifdef __linux__ - if (enableSigUsr2) { - struct sigaction act = {}; - act.sa_handler = &handle_signal; - act.sa_flags = SA_NODEFER; - if (sigaction(SIGUSR2, &act, &originalUsr2Handler) < 0) { - PLOG(ERROR) << "Failed to register SIGUSR2 handler"; - } - if (originalUsr2Handler.sa_handler == &handle_signal) { - originalUsr2Handler = {}; - } - } else if (hasOriginalSignalHandler()) { - sigaction(SIGUSR2, &originalUsr2Handler, nullptr); - originalUsr2Handler = {}; - } -#endif -} - -// return an empty string if reading gets any errors. Otherwise a config string. -static std::string readConfigFromConfigFile(const char* filename) { - // Read whole file into a string. 
- std::ifstream file(filename); - std::string conf; - try { - conf.assign( - std::istreambuf_iterator(file), std::istreambuf_iterator()); - } catch (std::exception& e) { - VLOG(0) << "Error reading " << filename << ": " - << e.what(); - conf = ""; - } - return conf; -} - -static std::function()>& -daemonConfigLoaderFactory() { - static std::function()> factory = nullptr; - return factory; -} - -void ConfigLoader::setDaemonConfigLoaderFactory( - std::function()> factory) { - daemonConfigLoaderFactory() = factory; -} - -ConfigLoader& ConfigLoader::instance() { - static ConfigLoader config_loader; - return config_loader; -} - -// return an empty string if polling gets any errors. Otherwise a config string. -std::string ConfigLoader::readOnDemandConfigFromDaemon( - time_point now) { - if (!daemonConfigLoader_) { - return ""; - } - bool events = canHandlerAcceptConfig(ConfigKind::EventProfiler); - bool activities = canHandlerAcceptConfig(ConfigKind::ActivityProfiler); - return daemonConfigLoader_->readOnDemandConfig(events, activities); -} - -int ConfigLoader::contextCountForGpu(uint32_t device) { - if (!daemonConfigLoader_) { - // FIXME: Throw error? 
- return 0; - } - return daemonConfigLoader_->gpuContextCount(device); -} - -ConfigLoader::ConfigLoader() - : configUpdateIntervalSecs_(kConfigUpdateIntervalSecs), - onDemandConfigUpdateIntervalSecs_(kOnDemandConfigUpdateIntervalSecs), - stopFlag_(false), - onDemandSignal_(false) { -} - -void ConfigLoader::startThread() { - if (!updateThread_) { - // Create default base config here - at this point static initializers - // of extensions should have run and registered all config feature factories - std::lock_guard lock(configLock_); - if (!config_) { - config_ = std::make_unique(); - } - updateThread_ = - std::make_unique(&ConfigLoader::updateConfigThread, this); - } -} - -ConfigLoader::~ConfigLoader() { - if (updateThread_) { - stopFlag_ = true; - { - std::lock_guard lock(updateThreadMutex_); - updateThreadCondVar_.notify_one(); - } - updateThread_->join(); - } -#if !USE_GOOGLE_LOG - Logger::clearLoggerObservers(); -#endif // !USE_GOOGLE_LOG -} - -void ConfigLoader::handleOnDemandSignal() { - onDemandSignal_ = true; - { - std::lock_guard lock(updateThreadMutex_); - updateThreadCondVar_.notify_one(); - } -} - -const char* ConfigLoader::configFileName() { - if (!configFileName_) { - configFileName_ = getenv(kConfigFileEnvVar); - if (configFileName_ == nullptr) { - configFileName_ = kConfigFile; - } - } - return configFileName_; -} - -DaemonConfigLoader* ConfigLoader::daemonConfigLoader() { - if (!daemonConfigLoader_ && daemonConfigLoaderFactory()) { - daemonConfigLoader_ = daemonConfigLoaderFactory()(); - daemonConfigLoader_->setCommunicationFabric(config_->ipcFabricEnabled()); - } - return daemonConfigLoader_.get(); -} - -void ConfigLoader::updateBaseConfig() { - // First try reading local config file - // If that fails, read from daemon - // TODO: Invert these once daemon path fully rolled out - std::string config_str = readConfigFromConfigFile(configFileName()); - if (config_str.empty() && daemonConfigLoader()) { - // If local config file was not successfully 
loaded (e.g. not found) - // then try the daemon - config_str = daemonConfigLoader()->readBaseConfig(); - } - if (config_str != config_->source()) { - std::lock_guard lock(configLock_); - config_ = std::make_unique(); - config_->parse(config_str); - if (daemonConfigLoader()) { - daemonConfigLoader()->setCommunicationFabric(config_->ipcFabricEnabled()); - } - setupSignalHandler(config_->sigUsr2Enabled()); - SET_LOG_VERBOSITY_LEVEL( - config_->verboseLogLevel(), - config_->verboseLogModules()); - VLOG(0) << "Detected base config change"; - } -} - -void ConfigLoader::configureFromSignal( - time_point now, - Config& config) { - LOG(INFO) << "Received on-demand profiling signal, " - << "reading config from " << kOnDemandConfigFile; - // Reset start time to 0 in order to compute new default start time - const std::string config_str = "PROFILE_START_TIME=0\n" - + readConfigFromConfigFile(kOnDemandConfigFile); - config.parse(config_str); - config.setSignalDefaults(); - notifyHandlers(config); -} - -void ConfigLoader::configureFromDaemon( - time_point now, - Config& config) { - const std::string config_str = readOnDemandConfigFromDaemon(now); - if (config_str.empty()) { - return; - } - - LOG(INFO) << "Received config from dyno:\n" << config_str; - config.parse(config_str); - notifyHandlers(config); -} - -void ConfigLoader::updateConfigThread() { - auto now = system_clock::now(); - auto next_config_load_time = now; - auto next_on_demand_load_time = now + onDemandConfigUpdateIntervalSecs_; - seconds interval = configUpdateIntervalSecs_; - if (interval > onDemandConfigUpdateIntervalSecs_) { - interval = onDemandConfigUpdateIntervalSecs_; - } - auto onDemandConfig = std::make_unique(); - - // This can potentially sleep for long periods of time, so allow - // the desctructor to wake it to avoid a 5-minute long destruct period. 
- for (;;) { - { - std::unique_lock lock(updateThreadMutex_); - updateThreadCondVar_.wait_for(lock, interval); - } - if (stopFlag_) { - break; - } - now = system_clock::now(); - if (now > next_config_load_time) { - updateBaseConfig(); - next_config_load_time = now + configUpdateIntervalSecs_; - } - if (onDemandSignal_.exchange(false)) { - onDemandConfig = config_->clone(); - configureFromSignal(now, *onDemandConfig); - } else if (now > next_on_demand_load_time) { - onDemandConfig = std::make_unique(); - configureFromDaemon(now, *onDemandConfig); - next_on_demand_load_time = now + onDemandConfigUpdateIntervalSecs_; - } - if (onDemandConfig->verboseLogLevel() >= 0) { - LOG(INFO) << "Setting verbose level to " - << onDemandConfig->verboseLogLevel() - << " from on-demand config"; - SET_LOG_VERBOSITY_LEVEL( - onDemandConfig->verboseLogLevel(), - onDemandConfig->verboseLogModules()); - } - } -} - -bool ConfigLoader::hasNewConfig(const Config& oldConfig) { - std::lock_guard lock(configLock_); - return config_->timestamp() > oldConfig.timestamp(); -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/ConfigLoader.h b/plugins/tensorboard-plugins/libkineto/src/ConfigLoader.h deleted file mode 100644 index 4ce3468e48d..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ConfigLoader.h +++ /dev/null @@ -1,147 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "Config.h" - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ILoggerObserver.h" - -namespace libkineto { - class LibkinetoApi; -} - -namespace KINETO_NAMESPACE { - -using namespace libkineto; -class DaemonConfigLoader; - -class ConfigLoader { - public: - - static ConfigLoader& instance(); - - enum ConfigKind { - ActivityProfiler = 0, - EventProfiler, - NumConfigKinds - }; - - struct ConfigHandler { - virtual ~ConfigHandler() {} - virtual bool canAcceptConfig() = 0; - virtual void acceptConfig(const Config& cfg) = 0; - }; - - void addHandler(ConfigKind kind, ConfigHandler* handler) { - std::lock_guard lock(updateThreadMutex_); - handlers_[kind].push_back(handler); - startThread(); - } - - void removeHandler(ConfigKind kind, ConfigHandler* handler) { - std::lock_guard lock(updateThreadMutex_); - auto it = std::find( - handlers_[kind].begin(), handlers_[kind].end(), handler); - if (it != handlers_[kind].end()) { - handlers_[kind].erase(it); - } - } - - void notifyHandlers(const Config& cfg) { - std::lock_guard lock(updateThreadMutex_); - for (auto& key_val : handlers_) { - for (ConfigHandler* handler : key_val.second) { - handler->acceptConfig(cfg); - } - } - } - - bool canHandlerAcceptConfig(ConfigKind kind) { - std::lock_guard lock(updateThreadMutex_); - for (ConfigHandler* handler : handlers_[kind]) { - if (!handler->canAcceptConfig()) { - return false; - } - } - return true; - } - - void initBaseConfig() { - bool init = false; - { - std::lock_guard lock(configLock_); - init = !config_ || config_->source().empty(); - } - if (init) { - updateBaseConfig(); - } - } - - inline std::unique_ptr getConfigCopy() { - std::lock_guard lock(configLock_); - return config_->clone(); - } - - bool hasNewConfig(const Config& oldConfig); - int contextCountForGpu(uint32_t gpu); - - void handleOnDemandSignal(); - - static void 
setDaemonConfigLoaderFactory( - std::function()> factory); - - private: - ConfigLoader(); - ~ConfigLoader(); - - const char* configFileName(); - DaemonConfigLoader* daemonConfigLoader(); - - void startThread(); - void updateConfigThread(); - void updateBaseConfig(); - - // Create configuration when receiving SIGUSR2 - void configureFromSignal( - std::chrono::time_point now, - Config& config); - - // Create configuration when receiving request from a daemon - void configureFromDaemon( - std::chrono::time_point now, - Config& config); - - std::string readOnDemandConfigFromDaemon( - std::chrono::time_point now); - - std::mutex configLock_; - std::atomic configFileName_{nullptr}; - std::unique_ptr config_; - std::unique_ptr daemonConfigLoader_; - std::map> handlers_; - - std::chrono::seconds configUpdateIntervalSecs_; - std::chrono::seconds onDemandConfigUpdateIntervalSecs_; - std::unique_ptr updateThread_; - std::condition_variable updateThreadCondVar_; - std::mutex updateThreadMutex_; - std::atomic_bool stopFlag_{false}; - std::atomic_bool onDemandSignal_{false}; - -#if !USE_GOOGLE_LOG - std::unique_ptr> loggerObservers_; - std::mutex loggerObserversMutex_; -#endif // !USE_GOOGLE_LOG -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.cpp b/plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.cpp deleted file mode 100644 index 1e909d5f9cf..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) Kineto Contributors - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include "CudaDeviceProperties.h" - -#include -#include - -#include -#include - -#include "Logger.h" - -namespace KINETO_NAMESPACE { - -static const std::vector createDeviceProps() { - std::vector props; - int device_count; - cudaError_t error_id = cudaGetDeviceCount(&device_count); - // Return empty vector if error. - if (error_id != cudaSuccess) { - LOG(ERROR) << "cudaGetDeviceCount failed with code " << error_id; - return {}; - } - VLOG(0) << "Device count is " << device_count; - for (size_t i = 0; i < device_count; ++i) { - cudaDeviceProp prop; - error_id = cudaGetDeviceProperties(&prop, i); - // Return empty vector if any device property fail to get. - if (error_id != cudaSuccess) { - LOG(ERROR) << "cudaGetDeviceProperties failed with " << error_id; - return {}; - } - props.push_back(prop); - LOGGER_OBSERVER_ADD_DEVICE(i); - } - return props; -} - -static const std::vector& deviceProps() { - static const std::vector props = createDeviceProps(); - return props; -} - -static const std::string createDevicePropertiesJson( - size_t id, const cudaDeviceProp& props) { - return fmt::format(R"JSON( - {{ - "id": {}, "name": "{}", "totalGlobalMem": {}, - "computeMajor": {}, "computeMinor": {}, - "maxThreadsPerBlock": {}, "maxThreadsPerMultiprocessor": {}, - "regsPerBlock": {}, "regsPerMultiprocessor": {}, "warpSize": {}, - "sharedMemPerBlock": {}, "sharedMemPerMultiprocessor": {}, - "numSms": {}, "sharedMemPerBlockOptin": {} - }})JSON", - id, props.name, props.totalGlobalMem, - props.major, props.minor, - props.maxThreadsPerBlock, props.maxThreadsPerMultiProcessor, - props.regsPerBlock, props.regsPerMultiprocessor, props.warpSize, - props.sharedMemPerBlock, props.sharedMemPerMultiprocessor, - props.multiProcessorCount, props.sharedMemPerBlockOptin); -} - -static const std::string createDevicePropertiesJson() { - std::vector jsonProps; - const auto& props = deviceProps(); - for (size_t i = 0; i < props.size(); i++) { - 
jsonProps.push_back(createDevicePropertiesJson(i, props[i])); - } - return fmt::format("{}", fmt::join(jsonProps, ",")); -} - -const std::string& devicePropertiesJson() { - static std::string devicePropsJson = createDevicePropertiesJson(); - return devicePropsJson; -} - -int smCount(uint32_t deviceId) { - const std::vector &props = deviceProps(); - return deviceId >= props.size() ? 0 : - props[deviceId].multiProcessorCount; -} - -float kernelOccupancy( - uint32_t deviceId, - uint16_t registersPerThread, - int32_t staticSharedMemory, - int32_t dynamicSharedMemory, - int32_t blockX, - int32_t blockY, - int32_t blockZ, - float blocksPerSm) { - // Calculate occupancy - float occupancy = -1.0; - const std::vector &props = deviceProps(); - if (deviceId < props.size()) { - cudaOccFuncAttributes occFuncAttr; - occFuncAttr.maxThreadsPerBlock = INT_MAX; - occFuncAttr.numRegs = registersPerThread; - occFuncAttr.sharedSizeBytes = staticSharedMemory; - occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF; - occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; - occFuncAttr.maxDynamicSharedSizeBytes = 0; - const cudaOccDeviceState occDeviceState = {}; - int blockSize = blockX * blockY * blockZ; - size_t dynamicSmemSize = dynamicSharedMemory; - cudaOccResult occ_result; - cudaOccDeviceProp prop(props[deviceId]); - cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( - &occ_result, &prop, &occFuncAttr, &occDeviceState, - blockSize, dynamicSmemSize); - if (status == CUDA_OCC_SUCCESS) { - if (occ_result.activeBlocksPerMultiprocessor < blocksPerSm) { - blocksPerSm = occ_result.activeBlocksPerMultiprocessor; - } - occupancy = blocksPerSm * blockSize / - (float) props[deviceId].maxThreadsPerMultiProcessor; - } else { - LOG_EVERY_N(ERROR, 1000) << "Failed to calculate occupancy, status = " - << status; - } - } - return occupancy; -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.h 
b/plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.h deleted file mode 100644 index b731fde0c2a..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CudaDeviceProperties.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) Kineto Contributors - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace KINETO_NAMESPACE { - -int smCount(uint32_t deviceId); - -// Return estimated achieved occupancy for a kernel -float kernelOccupancy( - uint32_t deviceId, - uint16_t registersPerThread, - int32_t staticSharedMemory, - int32_t dynamicSharedMemory, - int32_t blockX, - int32_t blockY, - int32_t blockZ, - float blocks_per_sm); - -// Return compute properties for each device as a json string -const std::string& devicePropertiesJson(); - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivity.h b/plugins/tensorboard-plugins/libkineto/src/CuptiActivity.h deleted file mode 100644 index 09c29504060..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivity.h +++ /dev/null @@ -1,114 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -#include "ITraceActivity.h" -#include "CuptiActivityPlatform.h" -#include "ThreadUtil.h" -#include "cupti_strings.h" - -namespace libkineto { - class ActivityLogger; -} - -namespace KINETO_NAMESPACE { - -using namespace libkineto; -struct TraceSpan; - -// These classes wrap the various CUPTI activity types -// into subclasses of ITraceActivity so that they can all be accessed -// using the ITraceActivity interface and logged via ActivityLogger. 
- -// Abstract base class, templated on Cupti activity type -template -struct CuptiActivity : public ITraceActivity { - explicit CuptiActivity(const T* activity, const ITraceActivity* linked) - : activity_(*activity), linked_(linked) {} - int64_t timestamp() const override { - return nsToUs(unixEpochTimestamp(activity_.start)); - } - int64_t duration() const override { - return nsToUs(activity_.end - activity_.start); - } - // TODO(T107507796): Deprecate ITraceActivity - int64_t correlationId() const override {return 0;} - int32_t getThreadId() const override {return 0;} - const ITraceActivity* linkedActivity() const override {return linked_;} - int flowType() const override {return kLinkAsyncCpuGpu;} - int flowId() const override {return correlationId();} - const T& raw() const {return activity_;} - const TraceSpan* traceSpan() const override {return nullptr;} - - protected: - const T& activity_; - const ITraceActivity* linked_{nullptr}; -}; - -// CUpti_ActivityAPI - CUDA runtime activities -struct RuntimeActivity : public CuptiActivity { - explicit RuntimeActivity( - const CUpti_ActivityAPI* activity, - const ITraceActivity* linked, - int32_t threadId) - : CuptiActivity(activity, linked), threadId_(threadId) {} - int64_t correlationId() const override {return activity_.correlationId;} - int64_t deviceId() const override {return processId();} - int64_t resourceId() const override {return threadId_;} - ActivityType type() const override {return ActivityType::CUDA_RUNTIME;} - bool flowStart() const override; - const std::string name() const override {return runtimeCbidName(activity_.cbid);} - void log(ActivityLogger& logger) const override; - const std::string metadataJson() const override; - - private: - const int32_t threadId_; -}; - -// CUpti_ActivityAPI - CUDA runtime activities -struct OverheadActivity : public CuptiActivity { - explicit OverheadActivity( - const CUpti_ActivityOverhead* activity, - const ITraceActivity* linked, - int32_t threadId=0) - : 
CuptiActivity(activity, linked), threadId_(threadId) {} - - int64_t timestamp() const override { - return nsToUs(unixEpochTimestamp(activity_.start)); - } - int64_t duration() const override { - return nsToUs(activity_.end - activity_.start); - } - // TODO: Update this with PID ordering - int64_t deviceId() const override {return -1;} - int64_t resourceId() const override {return threadId_;} - ActivityType type() const override {return ActivityType::OVERHEAD;} - bool flowStart() const override; - const std::string name() const override {return overheadKindString(activity_.overheadKind);} - void log(ActivityLogger& logger) const override; - const std::string metadataJson() const override; - - private: - const int32_t threadId_; -}; - -// Base class for GPU activities. -// Can also be instantiated directly. -template -struct GpuActivity : public CuptiActivity { - explicit GpuActivity(const T* activity, const ITraceActivity* linked) - : CuptiActivity(activity, linked) {} - int64_t correlationId() const override {return raw().correlationId;} - int64_t deviceId() const override {return raw().deviceId;} - int64_t resourceId() const override {return raw().streamId;} - ActivityType type() const override; - bool flowStart() const override {return false;} - const std::string name() const override; - void log(ActivityLogger& logger) const override; - const std::string metadataJson() const override; - const T& raw() const {return CuptiActivity::raw();} -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivity.tpp b/plugins/tensorboard-plugins/libkineto/src/CuptiActivity.tpp deleted file mode 100644 index 1ff2dafe06b..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivity.tpp +++ /dev/null @@ -1,111 +0,0 @@ - /* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. 
- * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "CuptiActivity.h" - -#include - -#include "Demangle.h" -#include "output_base.h" - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -template<> -inline const std::string GpuActivity::name() const { - return demangle(raw().name); -} - -template<> -inline ActivityType GpuActivity::type() const { - return ActivityType::CONCURRENT_KERNEL; -} - -static inline std::string memcpyName(uint8_t kind, uint8_t src, uint8_t dst) { - return fmt::format( - "Memcpy {} ({} -> {})", - memcpyKindString((CUpti_ActivityMemcpyKind)kind), - memoryKindString((CUpti_ActivityMemoryKind)src), - memoryKindString((CUpti_ActivityMemoryKind)dst)); -} - -template<> -inline ActivityType GpuActivity::type() const { - return ActivityType::GPU_MEMCPY; -} - -template<> -inline const std::string GpuActivity::name() const { - return memcpyName(raw().copyKind, raw().srcKind, raw().dstKind); -} - -template<> -inline ActivityType GpuActivity::type() const { - return ActivityType::GPU_MEMCPY; -} - -template<> -inline const std::string GpuActivity::name() const { - return memcpyName(raw().copyKind, raw().srcKind, raw().dstKind); -} - -template<> -inline const std::string GpuActivity::name() const { - const char* memory_kind = - memoryKindString((CUpti_ActivityMemoryKind)raw().memoryKind); - return fmt::format("Memset ({})", memory_kind); -} - -template<> -inline ActivityType GpuActivity::type() const { - return ActivityType::GPU_MEMSET; -} - -inline void RuntimeActivity::log(ActivityLogger& logger) const { - logger.handleActivity(*this); -} - -inline void OverheadActivity::log(ActivityLogger& logger) const { - logger.handleActivity(*this); -} - -inline bool OverheadActivity::flowStart() const { - return false; -} - -inline const std::string OverheadActivity::metadataJson() const { - return ""; -} - -template -inline void 
GpuActivity::log(ActivityLogger& logger) const { - logger.handleGpuActivity(*this); -} - -inline bool RuntimeActivity::flowStart() const { - return activity_.cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 || - (activity_.cbid >= CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 && - activity_.cbid <= CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020) || - activity_.cbid == - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000 || - activity_.cbid == - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000; -} - -inline const std::string RuntimeActivity::metadataJson() const { - return fmt::format(R"JSON( - "cbid": {}, "correlation": {})JSON", - activity_.cbid, activity_.correlationId); -} - -template -inline const std::string GpuActivity::metadataJson() const { - return ""; -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.cpp deleted file mode 100644 index 5718bed2f89..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.cpp +++ /dev/null @@ -1,343 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "CuptiActivityApi.h" - -#include -#include - -#include "cupti_call.h" -#include "Logger.h" - -using namespace std::chrono; - -namespace KINETO_NAMESPACE { - -// TODO: do we want this to be configurable? -// Set to 2MB to avoid constantly creating buffers (espeically for networks -// that has many small memcpy such as sparseNN) -// Consider putting this on huge pages? 
-constexpr size_t kBufSize(2 * 1024 * 1024); - -CuptiActivityApi& CuptiActivityApi::singleton() { - static CuptiActivityApi instance; - return instance; -} - -void CuptiActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { -#ifdef HAS_CUPTI - if (!singleton().externalCorrelationEnabled_) { - return; - } - VLOG(2) << "pushCorrelationID(" << id << ")"; - switch(type) { - case Default: - CUPTI_CALL(cuptiActivityPushExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, id)); - break; - case User: - CUPTI_CALL(cuptiActivityPushExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, id)); - } -#endif -} - -void CuptiActivityApi::popCorrelationID(CorrelationFlowType type) { -#ifdef HAS_CUPTI - if (!singleton().externalCorrelationEnabled_) { - return; - } - switch(type) { - case Default: - CUPTI_CALL(cuptiActivityPopExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, nullptr)); - break; - case User: - CUPTI_CALL(cuptiActivityPopExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, nullptr)); - } -#endif -} - -static int getSMCount() { -#ifdef HAS_CUPTI - // There may be a simpler way to get the number of SMs.... 
- // Look for domain_d - this has 80 instances on Volta and - // 56 instances on Pascal, corresponding to the number of SMs - // FIXME: This does not work on Turing and later - uint32_t domainCount{0}; - CUPTI_CALL(cuptiDeviceGetNumEventDomains(0, &domainCount)); - std::vector ids(domainCount); - size_t sz = sizeof(CUpti_EventDomainID) * domainCount; - CUPTI_CALL(cuptiDeviceEnumEventDomains(0, &sz, ids.data())); - for (CUpti_EventDomainID id : ids) { - char name[16]; - name[0] = '\0'; - sz = sizeof(name); - CUPTI_CALL(cuptiEventDomainGetAttribute( - id, CUPTI_EVENT_DOMAIN_ATTR_NAME, &sz, name)); - if (strncmp(name, "domain_d", sz) == 0) { - uint32_t count{0}; - sz = sizeof(count); - CUPTI_CALL(cuptiDeviceGetEventDomainAttribute( - 0, id, CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, &sz, &count)); - return count; - } - } -#endif - - return -1; -} - -int CuptiActivityApi::smCount() { - static int sm_count = getSMCount(); - return sm_count; -} - -static bool nextActivityRecord( - uint8_t* buffer, - size_t valid_size, - CUpti_Activity*& record) { -#ifdef HAS_CUPTI - CUptiResult status = CUPTI_CALL_NOWARN( - cuptiActivityGetNextRecord(buffer, valid_size, &record)); - if (status != CUPTI_SUCCESS) { - if (status != CUPTI_ERROR_MAX_LIMIT_REACHED) { - CUPTI_CALL(status); - } - record = nullptr; - } -#endif - return record != nullptr; -} - -void CuptiActivityApi::setMaxBufferSize(int size) { - maxGpuBufferCount_ = 1 + size / kBufSize; -} - -void CuptiActivityApi::forceLoadCupti() { -#ifdef HAS_CUPTI - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); -#endif -} - -#ifdef HAS_CUPTI -void CUPTIAPI CuptiActivityApi::bufferRequestedTrampoline( - uint8_t** buffer, - size_t* size, - size_t* maxNumRecords) { - singleton().bufferRequested(buffer, size, maxNumRecords); -} - -void CuptiActivityApi::bufferRequested( - uint8_t** buffer, size_t* size, size_t* maxNumRecords) { - std::lock_guard guard(mutex_); - if (allocatedGpuTraceBuffers_.size() >= 
maxGpuBufferCount_) { - stopCollection = true; - LOG(WARNING) << "Exceeded max GPU buffer count (" - << allocatedGpuTraceBuffers_.size() - << " > " << maxGpuBufferCount_ - << ") - terminating tracing"; - } - - auto buf = std::make_unique(kBufSize); - *buffer = buf->data(); - *size = kBufSize; - - allocatedGpuTraceBuffers_[*buffer] = std::move(buf); - - *maxNumRecords = 0; -} -#endif - -std::unique_ptr -CuptiActivityApi::activityBuffers() { - { - std::lock_guard guard(mutex_); - if (allocatedGpuTraceBuffers_.empty()) { - return nullptr; - } - } - -#ifdef HAS_CUPTI - VLOG(1) << "Flushing GPU activity buffers"; - time_point t1; - if (VLOG_IS_ON(1)) { - t1 = system_clock::now(); - } - // Can't hold mutex_ during this call, since bufferCompleted - // will be called by libcupti and mutex_ is acquired there. - CUPTI_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); - if (VLOG_IS_ON(1)) { - flushOverhead = - duration_cast(system_clock::now() - t1).count(); - } -#endif - std::lock_guard guard(mutex_); - // Transfer ownership of buffers to caller. A new map is created on-demand. 
- return std::move(readyGpuTraceBuffers_); -} - -#ifdef HAS_CUPTI -int CuptiActivityApi::processActivitiesForBuffer( - uint8_t* buf, - size_t validSize, - std::function handler) { - int count = 0; - if (buf && validSize) { - CUpti_Activity* record{nullptr}; - while ((nextActivityRecord(buf, validSize, record))) { - handler(record); - ++count; - } - } - return count; -} -#endif - -const std::pair CuptiActivityApi::processActivities( - CuptiActivityBufferMap& buffers, - std::function handler) { - std::pair res{0, 0}; -#ifdef HAS_CUPTI - for (auto& pair : buffers) { - // No lock needed - only accessed from this thread - auto& buf = pair.second; - res.first += processActivitiesForBuffer(buf->data(), buf->size(), handler); - res.second += buf->size(); - } -#endif - return res; -} - -void CuptiActivityApi::clearActivities() { - { - std::lock_guard guard(mutex_); - if (allocatedGpuTraceBuffers_.empty()) { - return; - } - } - // Can't hold mutex_ during this call, since bufferCompleted - // will be called by libcupti and mutex_ is acquired there. -#ifdef HAS_CUPTI - CUPTI_CALL(cuptiActivityFlushAll(0)); -#endif - // FIXME: We might want to make sure we reuse - // the same memory during warmup and tracing. - // Also, try to use the amount of memory required - // for active tracing during warmup. 
- std::lock_guard guard(mutex_); - // Throw away ready buffers as a result of above flush - readyGpuTraceBuffers_ = nullptr; -} - -#ifdef HAS_CUPTI -void CUPTIAPI CuptiActivityApi::bufferCompletedTrampoline( - CUcontext ctx, - uint32_t streamId, - uint8_t* buffer, - size_t /* unused */, - size_t validSize) { - singleton().bufferCompleted(ctx, streamId, buffer, 0, validSize); -} - -void CuptiActivityApi::bufferCompleted( - CUcontext ctx, - uint32_t streamId, - uint8_t* buffer, - size_t /* unused */, - size_t validSize) { - - std::lock_guard guard(mutex_); - auto it = allocatedGpuTraceBuffers_.find(buffer); - if (it == allocatedGpuTraceBuffers_.end()) { - LOG(ERROR) << "bufferCompleted called with unknown buffer: " - << (void*) buffer; - return; - } - - if (!readyGpuTraceBuffers_) { - readyGpuTraceBuffers_ = std::make_unique(); - } - // Set valid size of buffer before moving to ready map - it->second->setSize(validSize); - (*readyGpuTraceBuffers_)[it->first] = std::move(it->second); - allocatedGpuTraceBuffers_.erase(it); - - // report any records dropped from the queue; to avoid unnecessary cupti - // API calls, we make it report only in verbose mode (it doesn't happen - // often in our testing anyways) - if (VLOG_IS_ON(1)) { - size_t dropped = 0; - CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); - if (dropped != 0) { - LOG(WARNING) << "Dropped " << dropped << " activity records"; - } - } -} -#endif - -void CuptiActivityApi::enableCuptiActivities( - const std::set& selected_activities) { -#ifdef HAS_CUPTI - static bool registered = false; - if (!registered) { - CUPTI_CALL( - cuptiActivityRegisterCallbacks(bufferRequestedTrampoline, bufferCompletedTrampoline)); - } - - externalCorrelationEnabled_ = false; - for (const auto& activity : selected_activities) { - if (activity == ActivityType::GPU_MEMCPY) { - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); - } - if (activity == ActivityType::GPU_MEMSET) { - 
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); - } - if (activity == ActivityType::CONCURRENT_KERNEL) { - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); - } - if (activity == ActivityType::EXTERNAL_CORRELATION) { - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); - externalCorrelationEnabled_ = true; - } - if (activity == ActivityType::CUDA_RUNTIME) { - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); - } - if (activity == ActivityType::OVERHEAD) { - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); - } - } -#endif - - // Explicitly enabled, so reset this flag if set - stopCollection = false; -} - -void CuptiActivityApi::disableCuptiActivities( - const std::set& selected_activities) { -#ifdef HAS_CUPTI - for (const auto& activity : selected_activities) { - if (activity == ActivityType::GPU_MEMCPY) { - CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); - } - if (activity == ActivityType::GPU_MEMSET) { - CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); - } - if (activity == ActivityType::CONCURRENT_KERNEL) { - CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); - } - if (activity == ActivityType::EXTERNAL_CORRELATION) { - CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); - } - if (activity == ActivityType::CUDA_RUNTIME) { - CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); - } - if (activity == ActivityType::OVERHEAD) { - CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); - } - } - externalCorrelationEnabled_ = false; -#endif -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.h b/plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.h deleted file mode 100644 index 92af51ecac9..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityApi.h +++ /dev/null @@ -1,100 +0,0 @@ -// (c) Meta Platforms, Inc. 
and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#ifdef HAS_CUPTI -#include -#endif - -#include "ActivityType.h" -#include "CuptiActivityBuffer.h" - - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -#ifndef HAS_CUPTI -using CUpti_Activity = void; -#endif - -class CuptiActivityApi { - public: - enum CorrelationFlowType { - Default, - User - }; - - CuptiActivityApi() = default; - CuptiActivityApi(const CuptiActivityApi&) = delete; - CuptiActivityApi& operator=(const CuptiActivityApi&) = delete; - - virtual ~CuptiActivityApi() {} - - static CuptiActivityApi& singleton(); - - virtual int smCount(); - static void pushCorrelationID(int id, CorrelationFlowType type); - static void popCorrelationID(CorrelationFlowType type); - - void enableCuptiActivities( - const std::set& selected_activities); - void disableCuptiActivities( - const std::set& selected_activities); - void clearActivities(); - - virtual std::unique_ptr activityBuffers(); - - virtual const std::pair processActivities( - CuptiActivityBufferMap&, - std::function handler); - - void setMaxBufferSize(int size); - - std::atomic_bool stopCollection{false}; - int64_t flushOverhead{0}; - - static void forceLoadCupti(); - - private: -#ifdef HAS_CUPTI - int processActivitiesForBuffer( - uint8_t* buf, - size_t validSize, - std::function handler); - static void CUPTIAPI - bufferRequestedTrampoline(uint8_t** buffer, size_t* size, size_t* maxNumRecords); - static void CUPTIAPI bufferCompletedTrampoline( - CUcontext ctx, - uint32_t streamId, - uint8_t* buffer, - size_t /* unused */, - size_t validSize); -#endif // HAS_CUPTI - - int maxGpuBufferCount_{0}; - CuptiActivityBufferMap allocatedGpuTraceBuffers_; - std::unique_ptr readyGpuTraceBuffers_; - std::mutex mutex_; - bool externalCorrelationEnabled_{false}; - - protected: -#ifdef HAS_CUPTI - void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords); - void 
bufferCompleted( - CUcontext ctx, - uint32_t streamId, - uint8_t* buffer, - size_t /* unused */, - size_t validSize); -#endif -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityBuffer.h b/plugins/tensorboard-plugins/libkineto/src/CuptiActivityBuffer.h deleted file mode 100644 index 1c3fbef62c8..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityBuffer.h +++ /dev/null @@ -1,51 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "ITraceActivity.h" - -namespace KINETO_NAMESPACE { - -class CuptiActivityBuffer { - public: - explicit CuptiActivityBuffer(size_t size) : size_(size) { - buf_.reserve(size); - } - CuptiActivityBuffer() = delete; - CuptiActivityBuffer& operator=(const CuptiActivityBuffer&) = delete; - CuptiActivityBuffer(CuptiActivityBuffer&&) = default; - CuptiActivityBuffer& operator=(CuptiActivityBuffer&&) = default; - - size_t size() const { - return size_; - } - - void setSize(size_t size) { - assert(size <= buf_.capacity()); - size_ = size; - } - - uint8_t* data() { - return buf_.data(); - } - - private: - - std::vector buf_; - size_t size_; - - std::vector> wrappers_; -}; - -using CuptiActivityBufferMap = - std::map>; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.cpp deleted file mode 100644 index fa2ef2f3a8c..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include - -namespace chrono = std::chrono; - -namespace KINETO_NAMESPACE { - -#ifdef _WIN32 -uint64_t epochs_diff() { - // On Windows, steady_clock wraps the QueryPerformanceCounter function. 
- // https://docs.microsoft.com/en-us/cpp/standard-library/steady-clock-struct?view=msvc-160 - auto steady = - chrono::time_point_cast(chrono::steady_clock::now()); - auto system = - chrono::time_point_cast(chrono::system_clock::now()); - - auto time_since_unix = system.time_since_epoch().count(); - auto time_since_boot = steady.time_since_epoch().count(); - return time_since_unix - time_since_boot; -} - -uint64_t unixEpochTimestamp(uint64_t ts) { - static uint64_t diff = epochs_diff(); - return ts + diff; -} -#else -uint64_t unixEpochTimestamp(uint64_t ts) { - return ts; -} -#endif // _WIN32 - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.h b/plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.h deleted file mode 100644 index 78de8373d5f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityPlatform.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include - -namespace KINETO_NAMESPACE { - -// cupti's timestamps are platform specific. This function convert the raw -// cupti timestamp to time since unix epoch. So that on different platform, -// correction can work correctly. -uint64_t unixEpochTimestamp(uint64_t ts); - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.cpp deleted file mode 100644 index 97c23ef047d..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.cpp +++ /dev/null @@ -1,841 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "CuptiActivityProfiler.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef HAS_CUPTI -#include -#endif - -#include "Config.h" -#include "time_since_epoch.h" -#ifdef HAS_CUPTI -#include "CuptiActivity.h" -#include "CuptiActivity.tpp" -#include "CuptiActivityApi.h" -#endif // HAS_CUPTI -#ifdef HAS_ROCTRACER -#include "RoctracerActivityApi.h" -#endif -#include "output_base.h" - -#include "Logger.h" -#include "ThreadUtil.h" - -using namespace std::chrono; -using namespace libkineto; -using std::string; - -namespace KINETO_NAMESPACE { - -void CuptiActivityProfiler::transferCpuTrace( - std::unique_ptr cpuTrace) { - std::lock_guard guard(mutex_); - const string& trace_name = cpuTrace->span.name; - if (currentRunloopState_ != RunloopState::CollectTrace && - currentRunloopState_ != RunloopState::ProcessTrace) { - VLOG(0) << "Trace collection not in progress - discarding span " - << trace_name; - return; - } - - cpuTrace->span.iteration = iterationCountMap_[trace_name]++; - - VLOG(0) << "Received iteration " << cpuTrace->span.iteration << " of span " - << trace_name << " (" << cpuTrace->activities.size() << " activities / " - << cpuTrace->gpuOpCount << " gpu activities)"; - traceBuffers_->cpu.push_back(std::move(cpuTrace)); -} - -#ifdef HAS_ROCTRACER -CuptiActivityProfiler::CuptiActivityProfiler(RoctracerActivityApi& cupti, bool cpuOnly) -#else -CuptiActivityProfiler::CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly) -#endif - : cupti_(cupti), - flushOverhead_{0, 0}, - setupOverhead_{0, 0}, - cpuOnly_{cpuOnly}, - currentRunloopState_{RunloopState::WaitForRequest}, - stopCollection_{false} {} - -void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { - LOG(INFO) << "Processing " << traceBuffers_->cpu.size() - << " CPU buffers"; - VLOG(0) << "Profile time range: " << captureWindowStartTime_ << " - " - << captureWindowEndTime_; - logger.handleTraceStart(metadata_); - for (auto& cpu_trace : 
traceBuffers_->cpu) { - string trace_name = cpu_trace->span.name; - VLOG(0) << "Processing CPU buffer for " << trace_name << " (" - << cpu_trace->span.iteration << ") - " - << cpu_trace->activities.size() << " records"; - VLOG(0) << "Span time range: " << cpu_trace->span.startTime << " - " - << cpu_trace->span.endTime; - processCpuTrace(*cpu_trace, logger); - LOGGER_OBSERVER_ADD_EVENT_COUNT(cpu_trace->activities.size()); - } - -#ifdef HAS_CUPTI - if (!cpuOnly_) { - VLOG(0) << "Retrieving GPU activity buffers"; - traceBuffers_->gpu = cupti_.activityBuffers(); - if (VLOG_IS_ON(1)) { - addOverheadSample(flushOverhead_, cupti_.flushOverhead); - } - if (traceBuffers_->gpu) { - const auto count_and_size = cupti_.processActivities( - *traceBuffers_->gpu, - std::bind(&CuptiActivityProfiler::handleCuptiActivity, this, std::placeholders::_1, &logger)); - LOG(INFO) << "Processed " << count_and_size.first - << " GPU records (" << count_and_size.second << " bytes)"; - LOGGER_OBSERVER_ADD_EVENT_COUNT(count_and_size.first); - } - } -#endif // HAS_CUPTI -#ifdef HAS_ROCTRACER - if (!cpuOnly_) { - VLOG(0) << "Retrieving GPU activity buffers"; - const int count = cupti_.processActivities(logger); - LOG(INFO) << "Processed " << count - << " GPU records"; - LOGGER_OBSERVER_ADD_EVENT_COUNT(count); - } -#endif // HAS_ROCTRACER - - for (const auto& session : sessions_){ - LOG(INFO) << "Processing child profiler trace"; - session->processTrace(logger); - } - - finalizeTrace(*config_, logger); -} - -CuptiActivityProfiler::CpuGpuSpanPair& CuptiActivityProfiler::recordTraceSpan( - TraceSpan& span, int gpuOpCount) { - TraceSpan gpu_span(gpuOpCount, span.iteration, span.name, "GPU: "); - auto& iterations = traceSpans_[span.name]; - iterations.push_back({span, gpu_span}); - return iterations.back(); -} - -void CuptiActivityProfiler::processCpuTrace( - libkineto::CpuTraceBuffer& cpuTrace, - ActivityLogger& logger) { - if (cpuTrace.activities.size() == 0) { - LOG(WARNING) << "CPU trace is empty!"; 
- return; - } - - CpuGpuSpanPair& span_pair = recordTraceSpan(cpuTrace.span, cpuTrace.gpuOpCount); - TraceSpan& cpu_span = span_pair.first; - for (auto const& act : cpuTrace.activities) { - VLOG(2) << act.correlationId() << ": OP " << act.activityName; - if (config_->selectedActivityTypes().count(act.type())) { - act.log(logger); - } - clientActivityTraceMap_[act.correlationId()] = &span_pair; - activityMap_[act.correlationId()] = &act; - - recordThreadInfo(act.resourceId(), act.getThreadId(), act.deviceId()); - } - logger.handleTraceSpan(cpu_span); -} - -#ifdef HAS_CUPTI -inline void CuptiActivityProfiler::handleCorrelationActivity( - const CUpti_ActivityExternalCorrelation* correlation) { - if (correlation->externalKind == CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0) { - cpuCorrelationMap_[correlation->correlationId] = correlation->externalId; - } else if (correlation->externalKind == CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1){ - userCorrelationMap_[correlation->correlationId] = correlation->externalId; - } else { - LOG(ERROR) << "Invalid CUpti_ActivityExternalCorrelation sent to handleCuptiActivity"; - } -} -#endif // HAS_CUPTI - -static GenericTraceActivity createUserGpuSpan( - const libkineto::ITraceActivity& cpuTraceActivity, - const libkineto::ITraceActivity& gpuTraceActivity) { - GenericTraceActivity res( - *cpuTraceActivity.traceSpan(), - ActivityType::GPU_USER_ANNOTATION, - cpuTraceActivity.name()); - res.startTime = gpuTraceActivity.timestamp(); - res.device = gpuTraceActivity.deviceId(); - res.resource = gpuTraceActivity.resourceId(); - res.endTime = - gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); - res.id = cpuTraceActivity.correlationId(); - return res; -} - -void CuptiActivityProfiler::GpuUserEventMap::insertOrExtendEvent( - const ITraceActivity& userActivity, - const ITraceActivity& gpuActivity) { - StreamKey key(gpuActivity.deviceId(), gpuActivity.resourceId()); - CorrelationSpanMap& correlationSpanMap = streamSpanMap_[key]; - auto it = 
correlationSpanMap.find(userActivity.correlationId()); - if (it == correlationSpanMap.end()) { - auto it_success = correlationSpanMap.insert({ - userActivity.correlationId(), createUserGpuSpan(userActivity, gpuActivity) - }); - it = it_success.first; - } - GenericTraceActivity& span = it->second; - if (gpuActivity.timestamp() < span.startTime || span.startTime == 0) { - span.startTime = gpuActivity.timestamp(); - } - int64_t gpu_activity_end = gpuActivity.timestamp() + gpuActivity.duration(); - if (gpu_activity_end > span.endTime) { - span.endTime = gpu_activity_end; - } -} - -const CuptiActivityProfiler::CpuGpuSpanPair& CuptiActivityProfiler::defaultTraceSpan() { - static TraceSpan span(0, 0, "Unknown", ""); - static CpuGpuSpanPair span_pair(span, span); - return span_pair; -} - -void CuptiActivityProfiler::GpuUserEventMap::logEvents(ActivityLogger *logger) { - for (auto const& streamMapPair : streamSpanMap_) { - for (auto const& correlationSpanPair : streamMapPair.second) { - correlationSpanPair.second.log(*logger); - } - } -} - -#ifdef HAS_CUPTI -inline bool CuptiActivityProfiler::outOfRange(const ITraceActivity& act) { - bool out_of_range = act.timestamp() < captureWindowStartTime_ || - (act.timestamp() + act.duration()) > captureWindowEndTime_; - if (out_of_range) { - VLOG(2) << "TraceActivity outside of profiling window: " << act.name() - << " (" << act.timestamp() << " < " << captureWindowStartTime_ << " or " - << (act.timestamp() + act.duration()) << " > " << captureWindowEndTime_; - } - return out_of_range; -} - -inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) { - // Some CUDA calls that are very frequent and also not very interesting. - // Filter these out to reduce trace size. 
- if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 || - // Don't care about cudaEvents - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020) { - return true; - } - - return false; -} - -void CuptiActivityProfiler::handleRuntimeActivity( - const CUpti_ActivityAPI* activity, - ActivityLogger* logger) { - if (isBlockListedRuntimeCbid(activity->cbid)) { - return; - } - VLOG(2) << activity->correlationId - << ": CUPTI_ACTIVITY_KIND_RUNTIME, cbid=" << activity->cbid - << " tid=" << activity->threadId; - int32_t tid = activity->threadId; - const auto& it = resourceInfo_.find({processId(), tid}); - if (it != resourceInfo_.end()) { - tid = it->second.id; - } - const ITraceActivity* linked = linkedActivity( - activity->correlationId, cpuCorrelationMap_); - const auto& runtime_activity = - traceBuffers_->addActivityWrapper(RuntimeActivity(activity, linked, tid)); - checkTimestampOrder(&runtime_activity); - if (outOfRange(runtime_activity)) { - return; - } - runtime_activity.log(*logger); -} - -void CuptiActivityProfiler::handleOverheadActivity( - const CUpti_ActivityOverhead* activity, - ActivityLogger* logger) { - VLOG(2) << ": CUPTI_ACTIVITY_KIND_OVERHEAD" << " overheadKind=" << activity->overheadKind; - - const auto& overhead_activity = - traceBuffers_->addActivityWrapper(OverheadActivity(activity, nullptr)); - overhead_activity.log(*logger); -} - - -inline void CuptiActivityProfiler::updateGpuNetSpan( - const ITraceActivity& gpuOp) { - if (!gpuOp.linkedActivity()) { - VLOG(0) << "Missing linked activity"; - return; - } - const auto& it = clientActivityTraceMap_.find( - 
gpuOp.linkedActivity()->correlationId()); - if (it == clientActivityTraceMap_.end()) { - // No correlation id mapping? - return; - } - TraceSpan& gpu_span = it->second->second; - if (gpuOp.timestamp() < gpu_span.startTime || gpu_span.startTime == 0) { - gpu_span.startTime = gpuOp.timestamp(); - } - if ((gpuOp.timestamp() + gpuOp.duration()) > gpu_span.endTime) { - gpu_span.endTime = gpuOp.timestamp() + gpuOp.duration(); - } -} - -// I've observed occasional broken timestamps attached to GPU events... -void CuptiActivityProfiler::checkTimestampOrder(const ITraceActivity* act1) { - // Correlated GPU runtime activity cannot - // have timestamp greater than the GPU activity's - const auto& it = correlatedCudaActivities_.find(act1->correlationId()); - if (it == correlatedCudaActivities_.end()) { - correlatedCudaActivities_.insert({act1->correlationId(), act1}); - return; - } - - // Activities may be appear in the buffers out of order. - // If we have a runtime activity in the map, it should mean that we - // have a GPU activity passed in, and vice versa. - const ITraceActivity* act2 = it->second; - if (act2->type() == ActivityType::CUDA_RUNTIME) { - // Buffer is out-of-order. - // Swap so that runtime activity is first for the comparison below. 
- std::swap(act1, act2); - } - if (act1->timestamp() > act2->timestamp()) { - LOG(WARNING) << "GPU op timestamp (" << act2->timestamp() - << ") < runtime timestamp (" << act1->timestamp() << ") by " - << act1->timestamp() - act2->timestamp() << "us"; - LOG(WARNING) << "Name: " << act2->name() - << " Device: " << act2->deviceId() - << " Stream: " << act2->resourceId(); - } -} - -inline void CuptiActivityProfiler::handleGpuActivity( - const ITraceActivity& act, - ActivityLogger* logger) { - if (outOfRange(act)) { - return; - } - checkTimestampOrder(&act); - VLOG(2) << act.correlationId() << ": " - << act.name(); - recordStream(act.deviceId(), act.resourceId(), ""); - act.log(*logger); - updateGpuNetSpan(act); - if (config_->selectedActivityTypes().count(ActivityType::GPU_USER_ANNOTATION)) { - const auto& it = userCorrelationMap_.find(act.correlationId()); - if (it != userCorrelationMap_.end()) { - const auto& it2 = activityMap_.find(it->second); - if (it2 != activityMap_.end()) { - recordStream(act.deviceId(), act.resourceId(), "context"); - gpuUserEventMap_.insertOrExtendEvent(*it2->second, act); - } - } - } -} - -const ITraceActivity* CuptiActivityProfiler::linkedActivity( - int32_t correlationId, - const std::unordered_map& correlationMap) { - const auto& it = correlationMap.find(correlationId); - if (it != correlationMap.end()) { - const auto& it2 = activityMap_.find(it->second); - if (it2 != activityMap_.end()) { - return it2->second; - } - } - return nullptr; -} - -template -inline void CuptiActivityProfiler::handleGpuActivity( - const T* act, ActivityLogger* logger) { - const ITraceActivity* linked = linkedActivity( - act->correlationId, cpuCorrelationMap_); - const auto& gpu_activity = - traceBuffers_->addActivityWrapper(GpuActivity(act, linked)); - handleGpuActivity(gpu_activity, logger); -} - -void CuptiActivityProfiler::handleCuptiActivity(const CUpti_Activity* record, ActivityLogger* logger) { - switch (record->kind) { - case 
CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION: - handleCorrelationActivity( - reinterpret_cast( - record)); - break; - case CUPTI_ACTIVITY_KIND_RUNTIME: - handleRuntimeActivity( - reinterpret_cast(record), logger); - break; - case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: - handleGpuActivity( - reinterpret_cast(record), logger); - break; - case CUPTI_ACTIVITY_KIND_MEMCPY: - handleGpuActivity( - reinterpret_cast(record), logger); - break; - case CUPTI_ACTIVITY_KIND_MEMCPY2: - handleGpuActivity( - reinterpret_cast(record), logger); - break; - case CUPTI_ACTIVITY_KIND_MEMSET: - handleGpuActivity( - reinterpret_cast(record), logger); - break; - case CUPTI_ACTIVITY_KIND_OVERHEAD: - handleOverheadActivity (reinterpret_cast(record), logger); - break; - default: - LOG(WARNING) << "Unexpected activity type: " << record->kind; - break; - } -} -#endif // HAS_CUPTI - -void CuptiActivityProfiler::configureChildProfilers() { - // If child profilers are enabled create profiler sessions - for (auto& profiler: profilers_) { - int64_t start_time_ms = duration_cast( - profileStartTime_.time_since_epoch()).count(); - LOG(INFO) << "Running child profiler " << profiler->name() << " for " - << config_->activitiesDuration().count() << " ms"; - auto session = profiler->configure( - start_time_ms, - config_->activitiesDuration().count(), - config_->selectedActivityTypes(), - *config_ - ); - if (session) { - sessions_.push_back(std::move(session)); - } - } -} - -void CuptiActivityProfiler::configure( - const Config& config, - const time_point& now) { - std::lock_guard guard(mutex_); - if (isActive()) { - LOG(ERROR) << "CuptiActivityProfiler already busy, terminating"; - return; - } - - config_ = config.clone(); - - if (config_->activitiesDuration().count() == 0) { - // Use default if not specified - config_->setActivitiesDuration( - config_->activitiesDurationDefault()); - } - - // Ensure we're starting in a clean state - resetTraceData(); - -#if !USE_GOOGLE_LOG - // Add a LoggerObserverCollector to 
collect all logs during the trace. - loggerCollectorMetadata_ = std::make_unique(); - Logger::addLoggerObserver(loggerCollectorMetadata_.get()); -#endif // !USE_GOOGLE_LOG - - profileStartTime_ = config_->requestTimestamp(); - - if (config_->hasProfileStartIteration()) { - profileStartIter_ = config_->profileStartIteration(); - profileEndIter_ = profileStartIter_ + config_->activitiesRunIterations(); - } else { - - profileStartIter_ = -1; - profileEndIter_ = (std::numeric_limits::max)(); - - if (profileStartTime_ < now) { - LOG(ERROR) << "Not starting tracing - start timestamp is in the past. Time difference (ms): " << duration_cast(now - profileStartTime_).count(); - return; - } else if ((profileStartTime_ - now) < config_->activitiesWarmupDuration()) { - LOG(ERROR) << "Not starting tracing - insufficient time for warmup. Time to warmup (ms): " << duration_cast(profileStartTime_ - now).count() ; - return; - } - } - - if (LOG_IS_ON(INFO)) { - config_->printActivityProfilerConfig(LIBKINETO_DBG_STREAM); - } - if (!cpuOnly_ && !libkineto::api().client()) { - if (profileStartIter_ < 0) { - LOG(INFO) << "GPU-only tracing for " - << config_->activitiesDuration().count() << "ms"; - } else { - LOG(INFO) << "GPU-only tracing for " - << config_->activitiesRunIterations() << " iterations"; - } - } - - // Set useful metadata into the logger. - LOGGER_OBSERVER_SET_TRACE_DURATION_MS(config_->activitiesDuration().count()); - if (!config_->requestTraceID().empty()) { - LOGGER_OBSERVER_SET_TRACE_ID(config_->requestTraceID()); - } - if (!config_->requestGroupTraceID().empty()) { - LOGGER_OBSERVER_SET_GROUP_TRACE_ID(config_->requestGroupTraceID()); - } - LOGGER_OBSERVER_ADD_DESTINATION(config_->activitiesLogUrl()); - -#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) - if (!cpuOnly_) { - // Enabling CUPTI activity tracing incurs a larger perf hit at first, - // presumably because structures are allocated and initialized, callbacks - // are activated etc. 
After a while the overhead decreases and stabilizes. - // It's therefore useful to perform some warmup before starting recording. - LOG(INFO) << "Enabling GPU tracing"; - cupti_.setMaxBufferSize(config_->activitiesMaxGpuBufferSize()); - - time_point timestamp; - if (VLOG_IS_ON(1)) { - timestamp = system_clock::now(); - } -#ifdef HAS_CUPTI - cupti_.enableCuptiActivities(config_->selectedActivityTypes()); -#else - cupti_.enableActivities(config_->selectedActivityTypes()); -#endif - if (VLOG_IS_ON(1)) { - auto t2 = system_clock::now(); - addOverheadSample( - setupOverhead_, duration_cast(t2 - timestamp).count()); - } - } -#endif // HAS_CUPTI || HAS_ROCTRACER - - if (profilers_.size() > 0) { - configureChildProfilers(); - } - - if (libkineto::api().client()) { - libkineto::api().client()->warmup(config_->isOpInputsCollectionEnabled()); - } - if (profileStartIter_ >= 0) { - LOG(INFO) << "Tracing starting on iteration = " << profileStartIter_; - } else { - LOG(INFO) << "Tracing starting in " - << duration_cast(profileStartTime_ - now).count() << "s"; - } - - traceBuffers_ = std::make_unique(); - captureWindowStartTime_ = captureWindowEndTime_ = 0; - currentRunloopState_ = RunloopState::Warmup; -} - -void CuptiActivityProfiler::startTraceInternal(const time_point& now) { - captureWindowStartTime_ = libkineto::timeSinceEpoch(now); - VLOG(0) << "Warmup -> CollectTrace"; - for (auto& session: sessions_){ - LOG(INFO) << "Starting child profiler session"; - session->start(); - } - currentRunloopState_ = RunloopState::CollectTrace; -} - -void CuptiActivityProfiler::stopTraceInternal(const time_point& now) { - if (captureWindowEndTime_ == 0) { - captureWindowEndTime_ = libkineto::timeSinceEpoch(now); - } -#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) - if (!cpuOnly_) { - time_point timestamp; - if (VLOG_IS_ON(1)) { - timestamp = system_clock::now(); - } -#ifdef HAS_CUPTI - cupti_.disableCuptiActivities(config_->selectedActivityTypes()); -#else - 
cupti_.disableActivities(config_->selectedActivityTypes()); -#endif - if (VLOG_IS_ON(1)) { - auto t2 = system_clock::now(); - addOverheadSample( - setupOverhead_, duration_cast(t2 - timestamp).count()); - } - } -#endif // HAS_CUPTI || HAS_ROCTRACER - - if (currentRunloopState_ == RunloopState::CollectTrace) { - VLOG(0) << "CollectTrace -> ProcessTrace"; - } else { - LOG(WARNING) << "Called stopTrace with state == " << - static_cast::type>( - currentRunloopState_.load()); - } - for (auto& session: sessions_){ - LOG(INFO) << "Stopping child profiler session"; - session->stop(); - } - currentRunloopState_ = RunloopState::ProcessTrace; -} - -void CuptiActivityProfiler::resetInternal() { - resetTraceData(); - currentRunloopState_ = RunloopState::WaitForRequest; -} - -bool CuptiActivityProfiler::isWarmupDone( - const time_point& now, - int64_t currentIter) const { - // is it a time based config - if (profileStartIter_ < 0) { - // qualify that this check is not being called from application step() API - // this avoids races between the step() API and periodically invoked - // profiler run loop step() method - return (currentIter < 0) && (now >= profileStartTime_); - } - // this is an iteration based config - if (currentIter < 0) { - return false; - } - return currentIter >= profileStartIter_; -} - -bool CuptiActivityProfiler::isCollectionDone( - const time_point& now, - int64_t currentIter) const { - // is it a time based config - if (profileStartIter_ < 0) { - // qualify that this check is not being called from application step() API - return (currentIter < 0) && (now >= profileEndTime_); - } - // this is an iteration based config - if (currentIter < 0) { - return false; - } - return currentIter >= profileEndIter_; -} - -const time_point CuptiActivityProfiler::performRunLoopStep( - const time_point& now, - const time_point& nextWakeupTime, - int64_t currentIter) { - auto new_wakeup_time = nextWakeupTime; - bool warmup_done = false, collection_done = false; - - VLOG_IF(1, 
currentIter >= 0) << "Run loop on application step(), iteration = " - << currentIter; - - switch (currentRunloopState_) { - case RunloopState::WaitForRequest: - VLOG(1) << "State: WaitForRequest"; - // Nothing to do - break; - - case RunloopState::Warmup: - VLOG(1) << "State: Warmup"; - warmup_done = isWarmupDone(now, currentIter); -#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) - // Flushing can take a while so avoid doing it close to the start time - if (!cpuOnly_ && currentIter < 0 && - (profileStartIter_ >= 0 || nextWakeupTime < profileStartTime_)) { - cupti_.clearActivities(); - } - - if (cupti_.stopCollection) { - // Go to process trace to clear any outstanding buffers etc - LOG(WARNING) << "Trace terminated during warmup"; - std::lock_guard guard(mutex_); - stopTraceInternal(now); - resetInternal(); - VLOG(0) << "Warmup -> WaitForRequest"; - break; - } -#endif // HAS_CUPTI || HAS_ROCTRACER - - if (warmup_done) { - UST_LOGGER_MARK_COMPLETED(kWarmUpStage); - if (profileStartIter_ < 0 && - (now > profileStartTime_ + milliseconds(10))) { - LOG(WARNING) - << "Tracing started " - << duration_cast(now - profileStartTime_).count() - << "ms late!"; - } else { - LOG(INFO) << "Tracing started"; - } - startTrace(now); - if (libkineto::api().client()) { - libkineto::api().client()->start(); - } - if (nextWakeupTime > profileEndTime_) { - new_wakeup_time = profileEndTime_; - } - } else if (nextWakeupTime > profileStartTime_) { - new_wakeup_time = profileStartTime_; - } - - break; - - case RunloopState::CollectTrace: - VLOG(1) << "State: CollectTrace"; - // captureWindowStartTime_ can be set by external threads, - // so recompute end time. - // FIXME: Is this a good idea for synced start? 
- if (profileStartIter_ < 0) { - std::lock_guard guard(mutex_); - profileEndTime_ = time_point( - microseconds(captureWindowStartTime_)) + - config_->activitiesDuration(); - } - - collection_done = isCollectionDone(now, currentIter); - - // TODO revisit stopCollection_ is not used right now - if (collection_done || stopCollection_.exchange(false) -#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) - || cupti_.stopCollection -#endif // HAS_CUPTI || HAS_ROCTRACER - ){ - // Update runloop state first to prevent further updates to shared state - LOG(INFO) << "Tracing complete."; - if (currentIter > 0) { - LOG(INFO) << "This state change was invoked by application's step() call"; - } - // FIXME: Need to communicate reason for stopping on errors - if (libkineto::api().client()) { - libkineto::api().client()->stop(); - } - std::lock_guard guard(mutex_); - stopTraceInternal(now); - VLOG_IF(0, collection_done) << "Reached profile end time"; - - UST_LOGGER_MARK_COMPLETED(kCollectionStage); - } else if (profileStartIter_ >= 0) { - // nothing to do here - } else if (now < profileEndTime_ && profileEndTime_ < nextWakeupTime) { - new_wakeup_time = profileEndTime_; - } - - break; - - case RunloopState::ProcessTrace: - VLOG(1) << "State: ProcessTrace"; - // skip this state transition if it called from the step() api - // of the profiler. 
- // else it could lead to a race between the profiler thread and an - // application thread calling step() - if (currentIter >= 0) { - return new_wakeup_time; - } - // FIXME: Probably want to allow interruption here - // for quickly handling trace request via synchronous API - std::lock_guard guard(mutex_); - processTraceInternal(*logger_); - UST_LOGGER_MARK_COMPLETED(kPostProcessingStage); - resetInternal(); - VLOG(0) << "ProcessTrace -> WaitForRequest"; - break; - } - - return new_wakeup_time; -} - -void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logger) { - LOG(INFO) << "Recorded nets:"; - { - for (const auto& it : iterationCountMap_) { - LOG(INFO) << it.first << ": " << it.second << " iterations"; - } - iterationCountMap_.clear(); - } - - // Process names - int32_t pid = processId(); - string process_name = processName(pid); - if (!process_name.empty()) { - logger.handleDeviceInfo( - {pid, process_name, "CPU"}, captureWindowStartTime_); - if (!cpuOnly_) { - // GPU events use device id as pid (0-7). - constexpr int kMaxGpuCount = 8; - for (int gpu = 0; gpu < kMaxGpuCount; gpu++) { - logger.handleDeviceInfo( - {gpu, process_name, fmt::format("GPU {}", gpu)}, - captureWindowStartTime_); - } - } - } - - // Thread & stream info - for (auto pair : resourceInfo_) { - const auto& resource = pair.second; - logger.handleResourceInfo(resource, captureWindowStartTime_); - } - - for (const auto& iterations : traceSpans_) { - for (const auto& span_pair : iterations.second) { - const TraceSpan& gpu_span = span_pair.second; - if (gpu_span.opCount > 0) { - logger.handleTraceSpan(gpu_span); - } - } - } - - // Overhead info - overheadInfo_.push_back(ActivityLogger::OverheadInfo("CUPTI Overhead")); - for(const auto& info : overheadInfo_) { - logger.handleOverheadInfo(info, captureWindowStartTime_); - } - - gpuUserEventMap_.logEvents(&logger); - -#if !USE_GOOGLE_LOG - // Save logs from LoggerCollector objects into Trace metadata. 
- auto LoggerMD = loggerCollectorMetadata_->extractCollectorMetadata(); - std::unordered_map> LoggerMDString; - for (auto& md : LoggerMD) { - LoggerMDString[toString(md.first)] = md.second; - } -#endif // !USE_GOOGLE_LOG - - logger.finalizeTrace(config, std::move(traceBuffers_), captureWindowEndTime_, LoggerMDString); -} - -void CuptiActivityProfiler::resetTraceData() { -#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) - if (!cpuOnly_) { - cupti_.clearActivities(); - } -#endif // HAS_CUPTI || HAS_ROCTRACER - activityMap_.clear(); - cpuCorrelationMap_.clear(); - correlatedCudaActivities_.clear(); - gpuUserEventMap_.clear(); - traceSpans_.clear(); - clientActivityTraceMap_.clear(); - traceBuffers_ = nullptr; - metadata_.clear(); - sessions_.clear(); -#if !USE_GOOGLE_LOG - Logger::removeLoggerObserver(loggerCollectorMetadata_.get()); -#endif // !USE_GOOGLE_LOG -} - - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.h b/plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.h deleted file mode 100644 index 208833a4db7..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiActivityProfiler.h +++ /dev/null @@ -1,364 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ThreadUtil.h" -#include "TraceSpan.h" -#include "libkineto.h" -#include "output_base.h" -#include "GenericTraceActivity.h" -#include "IActivityProfiler.h" -#include "LoggerCollector.h" - -namespace KINETO_NAMESPACE { - -class Config; -class CuptiActivityApi; -class RoctracerActivityApi; - -class CuptiActivityProfiler { - public: - CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly); - CuptiActivityProfiler(RoctracerActivityApi& rai, bool cpuOnly); - CuptiActivityProfiler(const CuptiActivityProfiler&) = delete; - CuptiActivityProfiler& operator=(const CuptiActivityProfiler&) = delete; - - bool isActive() const { - return currentRunloopState_ != RunloopState::WaitForRequest; - } - - // Invoke at a regular interval to perform profiling activities. - // When not active, an interval of 1-5 seconds is probably fine, - // depending on required warm-up time and delayed start time. - // When active, it's a good idea to invoke more frequently to stay below - // memory usage limit (ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB) during warmup. 
- const std::chrono::time_point performRunLoopStep( - const std::chrono::time_point& now, - const std::chrono::time_point& nextWakeupTime, - int64_t currentIter = -1); - - // Used for async requests - void setLogger(ActivityLogger* logger) { - logger_ = logger; - } - - // Synchronous control API - void startTrace( - const std::chrono::time_point& now) { - std::lock_guard guard(mutex_); - startTraceInternal(now); - } - - void stopTrace(const std::chrono::time_point& now) { - std::lock_guard guard(mutex_); - stopTraceInternal(now); - } - - // Process CPU and GPU traces - void processTrace(ActivityLogger& logger) { - std::lock_guard guard(mutex_); - processTraceInternal(logger); - } - - void reset() { - std::lock_guard guard(mutex_); - resetInternal(); - } - - // Set up profiler as specified in config. - void configure( - const Config& config, - const std::chrono::time_point& now); - - // Registered with client API to pass CPU trace events over - void transferCpuTrace( - std::unique_ptr cpuTrace); - - Config& config() { - return *config_; - } - - inline void recordThreadInfo() { - int32_t sysTid = systemThreadId(); - // Note we're using the lower 32 bits of the (opaque) pthread id - // as key, because that's what CUPTI records. 
- int32_t tid = threadId(); - int32_t pid = processId(); - std::lock_guard guard(mutex_); - recordThreadInfo(sysTid, tid, pid); - } - - // T107508020: We can deprecate the recordThreadInfo(void) once we optimized profiler_kineto - void recordThreadInfo(int32_t sysTid, int32_t tid, int32_t pid) { - if (resourceInfo_.find({pid, tid}) == resourceInfo_.end()) { - resourceInfo_.emplace( - std::make_pair(pid, tid), - ActivityLogger::ResourceInfo( - pid, - sysTid, - sysTid, // sortindex - fmt::format("thread {} ({})", sysTid, getThreadName()))); - } - } - - void addMetadata(const std::string& key, const std::string& value) { - std::lock_guard guard(mutex_); - metadata_[key] = value; - } - - void addChildActivityProfiler( - std::unique_ptr profiler) { - std::lock_guard guard(mutex_); - profilers_.push_back(std::move(profiler)); - } - - protected: - - using CpuGpuSpanPair = std::pair; - static const CpuGpuSpanPair& defaultTraceSpan(); - - private: - - // Map of gpu activities to user defined events - class GpuUserEventMap { - public: - // Insert a user defined event which maps to the gpu trace activity. - // If the user defined event mapping already exists this will update the - // gpu side span to include the span of gpuTraceActivity. - void insertOrExtendEvent(const ITraceActivity& cpuTraceActivity, - const ITraceActivity& gpuTraceActivity); - // Log out the events to the logger - void logEvents(ActivityLogger *logger); - - void clear() { - streamSpanMap_.clear(); - } - - private: - // device id and stream name - using StreamKey = std::pair; - - // map of correlation id to TraceSpan - using CorrelationSpanMap = - std::unordered_map; - std::map streamSpanMap_; - }; - - GpuUserEventMap gpuUserEventMap_; - // id -> activity* - std::unordered_map activityMap_; - // cuda runtime id -> pytorch op id - // CUPTI provides a mechanism for correlating Cuda events to arbitrary - // external events, e.g.operator activities from PyTorch. 
- std::unordered_map cpuCorrelationMap_; - // CUDA runtime <-> GPU Activity - std::unordered_map - correlatedCudaActivities_; - std::unordered_map userCorrelationMap_; - - // data structure to collect cuptiActivityFlushAll() latency overhead - struct profilerOverhead { - int64_t overhead; - int cntr; - }; - - bool isWarmupDone( - const std::chrono::time_point& now, - int64_t currentIter) const; - - bool isCollectionDone( - const std::chrono::time_point& now, - int64_t currentIter) const; - - void startTraceInternal( - const std::chrono::time_point& now); - - void stopTraceInternal( - const std::chrono::time_point& now); - - void processTraceInternal(ActivityLogger& logger); - - void resetInternal(); - - void finalizeTrace(const Config& config, ActivityLogger& logger); - - void configureChildProfilers(); - - // Process a single CPU trace - void processCpuTrace( - libkineto::CpuTraceBuffer& cpuTrace, - ActivityLogger& logger); - - // Create resource names for streams - inline void recordStream(int device, int id, const char* postfix) { - if (resourceInfo_.find({device, id}) == resourceInfo_.end()) { - resourceInfo_.emplace( - std::make_pair(device, id), - ActivityLogger::ResourceInfo( - device, id, id, fmt::format( - "stream {} {}", id, postfix))); - } - } - - // Record client trace span for subsequent lookups from activities - // Also creates a corresponding GPU-side span. - CpuGpuSpanPair& recordTraceSpan(TraceSpan& span, int gpuOpCount); - - // Returns true if net name is to be tracked for a specified number of - // iterations. 
- bool iterationTargetMatch(libkineto::CpuTraceBuffer& trace); - - // net name to id - int netId(const std::string& netName); - - const ITraceActivity* linkedActivity( - int32_t correlationId, - const std::unordered_map& correlationMap); - -#ifdef HAS_CUPTI - // Process generic CUPTI activity - void handleCuptiActivity(const CUpti_Activity* record, ActivityLogger* logger); - - // Process specific GPU activity types - void updateGpuNetSpan(const ITraceActivity& gpuOp); - bool outOfRange(const ITraceActivity& act); - void handleCorrelationActivity( - const CUpti_ActivityExternalCorrelation* correlation); - void handleRuntimeActivity( - const CUpti_ActivityAPI* activity, ActivityLogger* logger); - void handleOverheadActivity( - const CUpti_ActivityOverhead* activity, ActivityLogger* logger); - void handleGpuActivity(const ITraceActivity& act, - ActivityLogger* logger); - template - void handleGpuActivity(const T* act, ActivityLogger* logger); -#endif // HAS_CUPTI - - void resetTraceData(); - - void addOverheadSample(profilerOverhead& counter, int64_t overhead) { - counter.overhead += overhead; - counter.cntr++; - } - int64_t getOverhead(const profilerOverhead& counter) { - if (counter.cntr == 0) { - return 0; - } - return counter.overhead / counter.cntr; - } - - void checkTimestampOrder(const ITraceActivity* act1); - - // On-demand request configuration - std::unique_ptr config_; - - // Logger used during trace processing - ActivityLogger* logger_; - - // Calls to CUPTI is encapsulated behind this interface -#ifdef HAS_ROCTRACER - RoctracerActivityApi& cupti_; // Design failure here -#else - CuptiActivityApi& cupti_; -#endif - - enum class RunloopState { - WaitForRequest, - Warmup, - CollectTrace, - ProcessTrace - }; - - // Start and end time used for triggering and stopping profiling - std::chrono::time_point profileStartTime_; - std::chrono::time_point profileEndTime_; - int64_t profileStartIter_ = -1, profileEndIter_ = -1; - - - // All recorded trace spans, both 
CPU and GPU - // Trace Id -> list of iterations. - // Using map of lists for the iterator semantics, since we are recording - // pointers to the elements in this structure. - std::map> traceSpans_; - - // Maintain a map of client trace activity to trace span. - // Maps correlation id -> TraceSpan* held by traceSpans_. - using ActivityTraceMap = std::unordered_map; - ActivityTraceMap clientActivityTraceMap_; - - // Cache thread names and system thread ids for pthread ids, - // and stream ids for GPU streams - std::map< - std::pair, - ActivityLogger::ResourceInfo> resourceInfo_; - - std::vector overheadInfo_; - - // the overhead to flush the activity buffer - profilerOverhead flushOverhead_; - // the overhead to enable/disable activity tracking - profilerOverhead setupOverhead_; - - bool cpuOnly_{false}; - - // *************************************************************************** - // Below state is shared with external threads. - // These need to either be atomic, accessed under lock or only used - // by external threads in separate runloop phases from the profiler thread. - // *************************************************************************** - - // Mutex to protect non-atomic access to below state - std::mutex mutex_; - - // Runloop phase - std::atomic currentRunloopState_{RunloopState::WaitForRequest}; - - // Keep track of the start time of the first net in the current trace. - // This is only relevant to Caffe2 as PyTorch does not have nets. - // All CUDA events before this time will be removed - // Can be written by external threads during collection. - int64_t captureWindowStartTime_{0}; - // Similarly, all CUDA API events after the last net event will be removed - int64_t captureWindowEndTime_{0}; - - // span name -> iteration count - std::map iterationCountMap_; - // Flag used to stop tracing from external api callback. - // Needs to be atomic since it's set from a different thread. 
- std::atomic_bool stopCollection_{false}; - - // Buffers where trace data is stored - std::unique_ptr traceBuffers_; - - // Trace metadata - std::unordered_map metadata_; - - // child activity profilers - std::vector> profilers_; - - // a vector of active profiler plugin sessions - std::vector> sessions_; - - // LoggerCollector to collect all LOGs during the trace -#if !USE_GOOGLE_LOG - std::unique_ptr loggerCollectorMetadata_; -#endif // !USE_GOOGLE_LOG -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.cpp deleted file mode 100644 index 1876003998d..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "CuptiCallbackApi.h" - -#include -#include -#include -#include -#include - -#ifdef HAS_CUPTI -#include "cupti_call.h" -#endif -#include "Logger.h" - - -namespace KINETO_NAMESPACE { - -// limit on number of handles per callback type -constexpr size_t MAX_CB_FNS_PER_CB = 8; - -// Reader Writer lock types -using ReaderWriterLock = std::shared_timed_mutex; -using ReaderLockGuard = std::shared_lock; -using WriteLockGuard = std::unique_lock; - -static ReaderWriterLock callbackLock_; - -/* Callback Table : - * Overall goal of the design is to optimize the lookup of function - * pointers. The table is structured at two levels and the leaf - * elements in the table are std::list to enable fast access/inserts/deletes - * - * | - * -> cb id 0 -> std::list of callbacks - * ... - * -> cb id n -> std::list of callbacks - * | - * ... - * CallbackTable is the finaly table type above - * See type declrartions in header file. - */ - - -/* callback_switchboard : is the global callback handler we register - * with CUPTI. The goal is to make it as efficient as possible - * to re-direct to the registered callback(s). 
- * - * Few things to care about : - * a) use if/then switches rather than map/hash structures - * b) avoid dynamic memory allocations - * c) be aware of locking overheads - */ -#ifdef HAS_CUPTI -static void CUPTIAPI callback_switchboard( -#else -static void callback_switchboard( -#endif - void* /* unused */, - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo) { - - // below statement is likey going to call a mutex - // on the singleton access - CuptiCallbackApi::singleton().__callback_switchboard( - domain, cbid, cbInfo); -} - - -void CuptiCallbackApi::__callback_switchboard( - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo) { - VLOG(0) << "Callback: domain = " << domain << ", cbid = " << cbid; - CallbackList *cblist = nullptr; - - switch (domain) { - - // add the fastest path for kernel launch callbacks - // as these are the most frequent ones - case CUPTI_CB_DOMAIN_RUNTIME_API: - switch (cbid) { - case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000: - cblist = &callbacks_.runtime[ - CUDA_LAUNCH_KERNEL - __RUNTIME_CB_DOMAIN_START]; - break; - default: - break; - } - break; - - case CUPTI_CB_DOMAIN_RESOURCE: - switch (cbid) { - case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: - cblist = &callbacks_.resource[ - RESOURCE_CONTEXT_CREATED - __RESOURCE_CB_DOMAIN_START]; - break; - case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: - cblist = &callbacks_.resource[ - RESOURCE_CONTEXT_DESTROYED - __RESOURCE_CB_DOMAIN_START]; - break; - default: - break; - } - break; - - default: - return; - } - - // ignore callbacks that are not handled - if (cblist == nullptr) { - return; - } - - // make a copy of the callback list so we avoid holding lock - // in common case this should be just one func pointer copy - std::array callbacks; - int num_cbs = 0; - { - ReaderLockGuard rl(callbackLock_); - int i = 0; - for (auto it = cblist->begin(); - it != cblist->end() && i < MAX_CB_FNS_PER_CB; - it++, i++) { - 
callbacks[i] = *it; - } - num_cbs = i; - } - - for (int i = 0; i < num_cbs; i++) { - auto fn = callbacks[i]; - fn(domain, cbid, cbInfo); - } -} - -CuptiCallbackApi& CuptiCallbackApi::singleton() { - static CuptiCallbackApi instance; - return instance; -} - -CuptiCallbackApi::CuptiCallbackApi() { -#ifdef HAS_CUPTI - lastCuptiStatus_ = CUPTI_ERROR_UNKNOWN; - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiSubscribe(&subscriber_, - (CUpti_CallbackFunc)callback_switchboard, - nullptr)); - - initSuccess_ = (lastCuptiStatus_ == CUPTI_SUCCESS); -#endif -} - -CuptiCallbackApi::CallbackList* CuptiCallbackApi::CallbackTable::lookup( - CUpti_CallbackDomain domain, CuptiCallBackID cbid) { - size_t idx; - - switch (domain) { - - case CUPTI_CB_DOMAIN_RESOURCE: - assert(cbid >= __RESOURCE_CB_DOMAIN_START); - assert(cbid < __RESOURCE_CB_DOMAIN_END); - idx = cbid - __RESOURCE_CB_DOMAIN_START; - return &resource.at(idx); - - case CUPTI_CB_DOMAIN_RUNTIME_API: - assert(cbid >= __RUNTIME_CB_DOMAIN_START); - assert(cbid < __RUNTIME_CB_DOMAIN_END); - idx = cbid - __RUNTIME_CB_DOMAIN_START; - return &runtime.at(idx); - - default: - LOG(WARNING) << " Unsupported callback domain : " << domain; - return nullptr; - } -} - -bool CuptiCallbackApi::registerCallback( - CUpti_CallbackDomain domain, - CuptiCallBackID cbid, - CuptiCallbackFn cbfn) { - CallbackList* cblist = callbacks_.lookup(domain, cbid); - - if (!cblist) { - LOG(WARNING) << "Could not register callback -- domain = " << domain - << " callback id = " << cbid; - return false; - } - - // avoid duplicates - auto it = std::find(cblist->begin(), cblist->end(), cbfn); - if (it != cblist->end()) { - LOG(WARNING) << "Adding duplicate callback -- domain = " << domain - << " callback id = " << cbid; - return true; - } - - if (cblist->size() == MAX_CB_FNS_PER_CB) { - LOG(WARNING) << "Already registered max callback -- domain = " << domain - << " callback id = " << cbid; - } - - WriteLockGuard wl(callbackLock_); - cblist->push_back(cbfn); - return 
true; -} - -bool CuptiCallbackApi::deleteCallback( - CUpti_CallbackDomain domain, - CuptiCallBackID cbid, - CuptiCallbackFn cbfn) { - CallbackList* cblist = callbacks_.lookup(domain, cbid); - if (!cblist) { - LOG(WARNING) << "Attempting to remove unsupported callback -- domain = " << domain - << " callback id = " << cbid; - return false; - } - - // Locks are not required here as - // https://en.cppreference.com/w/cpp/container/list/erase - // "References and iterators to the erased elements are invalidated. - // Other references and iterators are not affected." - auto it = std::find(cblist->begin(), cblist->end(), cbfn); - if (it == cblist->end()) { - LOG(WARNING) << "Could not find callback to remove -- domain = " << domain - << " callback id = " << cbid; - return false; - } - - WriteLockGuard wl(callbackLock_); - cblist->erase(it); - return true; -} - -bool CuptiCallbackApi::enableCallback( - CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { -#ifdef HAS_CUPTI - if (initSuccess_) { - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiEnableCallback(1, subscriber_, domain, cbid)); - return (lastCuptiStatus_ == CUPTI_SUCCESS); - } -#endif - return false; -} - -bool CuptiCallbackApi::disableCallback( - CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { -#ifdef HAS_CUPTI - if (initSuccess_) { - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiEnableCallback(0, subscriber_, domain, cbid)); - return (lastCuptiStatus_ == CUPTI_SUCCESS); - } -#endif - return false; -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.h b/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.h deleted file mode 100644 index 4526f3750b4..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApi.h +++ /dev/null @@ -1,130 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#ifdef HAS_CUPTI -#include -#endif -#include -#include -#include -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "CuptiCallbackApiMock.h" - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - - -/* CuptiCallbackApi : Provides an abstraction over CUPTI callback - * interface. This enables various callback functions to be registered - * with this class. The class registers a global callback handler that - * redirects to the respective callbacks. - * - * Note: one design choice we made is to only support simple function pointers - * in order to speed up the implementation for fast path. - */ - -using CuptiCallbackFn = void(*)( - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo); - - -class CuptiCallbackApi { - - public: - - /* Global list of supported callback ids - * use the class namespace to avoid confusing with CUPTI enums*/ - enum CuptiCallBackID { - CUDA_LAUNCH_KERNEL = 0, - // can possibly support more callback ids per domain - // - __RUNTIME_CB_DOMAIN_START = CUDA_LAUNCH_KERNEL, - - // Callbacks under Resource CB domain - RESOURCE_CONTEXT_CREATED, - RESOURCE_CONTEXT_DESTROYED, - - __RUNTIME_CB_DOMAIN_END = RESOURCE_CONTEXT_CREATED, - __RESOURCE_CB_DOMAIN_START = RESOURCE_CONTEXT_CREATED, - - __RESOURCE_CB_DOMAIN_END = RESOURCE_CONTEXT_DESTROYED + 1, - }; - - - CuptiCallbackApi(const CuptiCallbackApi&) = delete; - CuptiCallbackApi& operator=(const CuptiCallbackApi&) = delete; - - static CuptiCallbackApi& singleton(); - - bool initSuccess() const { - return initSuccess_; - } - -#ifdef HAS_CUPTI - CUptiResult getCuptiStatus() const { - return lastCuptiStatus_; - } -#endif - - bool registerCallback( - CUpti_CallbackDomain domain, - CuptiCallBackID cbid, - CuptiCallbackFn cbfn); - - // returns false if callback was not found - bool deleteCallback( - CUpti_CallbackDomain domain, - CuptiCallBackID cbid, - CuptiCallbackFn 
cbfn); - - bool enableCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid); - bool disableCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid); - - - // Please do not use this method. This has to be exposed as public - // so it is accessible from the callback handler - void __callback_switchboard( - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo); - - private: - - explicit CuptiCallbackApi(); - - // For callback table design overview see the .cpp file - using CallbackList = std::list; - - // level 2 tables sizes are known at compile time - constexpr static size_t RUNTIME_CB_DOMAIN_SIZE - = (__RUNTIME_CB_DOMAIN_END - __RUNTIME_CB_DOMAIN_START); - - constexpr static size_t RESOURCE_CB_DOMAIN_SIZE - = (__RESOURCE_CB_DOMAIN_END - __RESOURCE_CB_DOMAIN_START); - - // level 1 table is a struct - struct CallbackTable { - std::array runtime; - std::array resource; - - CallbackList* lookup(CUpti_CallbackDomain domain, CuptiCallBackID cbid); - }; - - CallbackTable callbacks_; - bool initSuccess_ = false; - -#ifdef HAS_CUPTI - CUptiResult lastCuptiStatus_; - CUpti_SubscriberHandle subscriber_; -#endif -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApiMock.h b/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApiMock.h deleted file mode 100644 index fd51267274f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiCallbackApiMock.h +++ /dev/null @@ -1,32 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -// Provides data structures to mock CUPTI Callback API -#ifndef HAS_CUPTI - -enum CUpti_CallbackDomain { - CUPTI_CB_DOMAIN_RESOURCE, - CUPTI_CB_DOMAIN_RUNTIME_API, -}; -enum CUpti_CallbackId { - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, - CUPTI_CBID_RESOURCE_CONTEXT_CREATED, - CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, -}; - -using CUcontext = void*; - -struct CUpti_ResourceData { - CUcontext context; -}; - -constexpr int CUPTI_API_ENTER = 0; -constexpr int CUPTI_API_EXIT = 0; - -struct CUpti_CallbackData { - CUcontext context; - const char* symbolName; - int callbackSite; -}; -#endif // HAS_CUPTI diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.cpp deleted file mode 100644 index 7f1d48c1d00..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "CuptiEventApi.h" - -#include - -#include "Logger.h" -#include "cupti_call.h" - -using namespace std::chrono; -using std::vector; - -namespace KINETO_NAMESPACE { - -CuptiEventApi::CuptiEventApi(CUcontext context) - : context_(context) { - CUPTI_CALL(cuptiGetDeviceId(context_, (uint32_t*)&device_)); -} - -CUpti_EventGroupSets* CuptiEventApi::createGroupSets( - vector& ids) { - CUpti_EventGroupSets* group_sets = nullptr; - CUptiResult res = CUPTI_CALL(cuptiEventGroupSetsCreate( - context_, sizeof(CUpti_EventID) * ids.size(), ids.data(), &group_sets)); - - if (res != CUPTI_SUCCESS || group_sets == nullptr) { - const char* errstr = nullptr; - CUPTI_CALL(cuptiGetResultString(res, &errstr)); - throw std::system_error(EINVAL, std::generic_category(), errstr); - } - - return group_sets; -} - -void CuptiEventApi::destroyGroupSets(CUpti_EventGroupSets* sets) { - CUPTI_CALL(cuptiEventGroupSetsDestroy(sets)); -} - -bool CuptiEventApi::setContinuousMode() { - // Avoid logging noise for 
CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED - CUptiResult res = CUPTI_CALL_NOWARN(cuptiSetEventCollectionMode( - context_, CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS)); - if (res == CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED) { - return false; - } - // Log warning on other errors - CUPTI_CALL(res); - return (res == CUPTI_SUCCESS); -} - -void CuptiEventApi::enablePerInstance(CUpti_EventGroup eventGroup) { - uint32_t profile_all = 1; - CUPTI_CALL(cuptiEventGroupSetAttribute( - eventGroup, - CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, - sizeof(profile_all), - &profile_all)); -} - -uint32_t CuptiEventApi::instanceCount(CUpti_EventGroup eventGroup) { - uint32_t instance_count = 0; - size_t s = sizeof(instance_count); - CUPTI_CALL(cuptiEventGroupGetAttribute( - eventGroup, CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, &s, &instance_count)); - return instance_count; -} - -void CuptiEventApi::enableGroupSet(CUpti_EventGroupSet& set) { - CUptiResult res = CUPTI_CALL_NOWARN(cuptiEventGroupSetEnable(&set)); - if (res != CUPTI_SUCCESS) { - const char* errstr = nullptr; - CUPTI_CALL(cuptiGetResultString(res, &errstr)); - throw std::system_error(EIO, std::generic_category(), errstr); - } -} - -void CuptiEventApi::disableGroupSet(CUpti_EventGroupSet& set) { - CUPTI_CALL(cuptiEventGroupSetDisable(&set)); -} - -void CuptiEventApi::readEvent( - CUpti_EventGroup grp, - CUpti_EventID id, - vector& vals) { - size_t s = sizeof(int64_t) * vals.size(); - CUPTI_CALL(cuptiEventGroupReadEvent( - grp, - CUPTI_EVENT_READ_FLAG_NONE, - id, - &s, - reinterpret_cast(vals.data()))); -} - -vector CuptiEventApi::eventsInGroup(CUpti_EventGroup grp) { - uint32_t group_size = 0; - size_t s = sizeof(group_size); - CUPTI_CALL(cuptiEventGroupGetAttribute( - grp, CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, &s, &group_size)); - size_t events_size = group_size * sizeof(CUpti_EventID); - vector res(group_size); - CUPTI_CALL(cuptiEventGroupGetAttribute( - grp, CUPTI_EVENT_GROUP_ATTR_EVENTS, &events_size, res.data())); - 
return res; -} - -CUpti_EventID CuptiEventApi::eventId(const std::string& name) { - CUpti_EventID id{0}; - CUPTI_CALL(cuptiEventGetIdFromName(device_, name.c_str(), &id)); - return id; -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.h b/plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.h deleted file mode 100644 index 79610f93f0e..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiEventApi.h +++ /dev/null @@ -1,49 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include - -namespace KINETO_NAMESPACE { - -// C++ interface to CUPTI Events C API. -// Virtual methods are here mainly to allow easier testing. -class CuptiEventApi { - public: - explicit CuptiEventApi(CUcontext context_); - virtual ~CuptiEventApi() {} - - CUdevice device() { - return device_; - } - - virtual CUpti_EventGroupSets* createGroupSets( - std::vector& ids); - virtual void destroyGroupSets(CUpti_EventGroupSets* sets); - - virtual bool setContinuousMode(); - - virtual void enablePerInstance(CUpti_EventGroup eventGroup); - virtual uint32_t instanceCount(CUpti_EventGroup eventGroup); - - virtual void enableGroupSet(CUpti_EventGroupSet& set); - virtual void disableGroupSet(CUpti_EventGroupSet& set); - - virtual void - readEvent(CUpti_EventGroup g, CUpti_EventID id, std::vector& vals); - virtual std::vector eventsInGroup(CUpti_EventGroup g); - - virtual CUpti_EventID eventId(const std::string& name); - - protected: - // Unit testing - CuptiEventApi() : context_(nullptr), device_(0) {} - - private: - CUcontext context_; - CUdevice device_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.cpp deleted file mode 100644 index 36401e74341..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.cpp +++ /dev/null @@ 
-1,107 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "CuptiMetricApi.h" - -#include - -#include "Logger.h" -#include "cupti_call.h" - -using namespace std::chrono; -using std::vector; - -namespace KINETO_NAMESPACE { - -CUpti_MetricID CuptiMetricApi::idFromName(const std::string& name) { - CUpti_MetricID metric_id{~0u}; - CUptiResult res = - CUPTI_CALL(cuptiMetricGetIdFromName(device_, name.c_str(), &metric_id)); - if (res == CUPTI_ERROR_INVALID_METRIC_NAME) { - LOG(WARNING) << "Invalid metric name: " << name; - } - return metric_id; -} - -// Return a map of event IDs and names for a given metric id. -// Note that many events don't have a name. In that case the name will -// be set to the empty string. -std::map CuptiMetricApi::events( - CUpti_MetricID metric_id) { - uint32_t num_events = 0; - CUPTI_CALL(cuptiMetricGetNumEvents(metric_id, &num_events)); - vector ids(num_events); - size_t array_size = num_events * sizeof(CUpti_EventID); - CUPTI_CALL(cuptiMetricEnumEvents(metric_id, &array_size, ids.data())); - std::map res; - for (CUpti_EventID id : ids) { - // Attempt to lookup name from CUPTI - constexpr size_t kMaxEventNameLength = 64; - char cupti_name[kMaxEventNameLength]; - size_t size = kMaxEventNameLength; - CUPTI_CALL( - cuptiEventGetAttribute(id, CUPTI_EVENT_ATTR_NAME, &size, cupti_name)); - cupti_name[kMaxEventNameLength - 1] = 0; - - // CUPTI "helpfully" returns "event_name" when the event is unnamed. 
- if (size > 0 && strcmp(cupti_name, "event_name") != 0) { - res.emplace(id, cupti_name); - } else { - res.emplace(id, ""); - } - } - return res; -} - -CUpti_MetricValueKind CuptiMetricApi::valueKind(CUpti_MetricID metric) { - CUpti_MetricValueKind res{CUPTI_METRIC_VALUE_KIND_FORCE_INT}; - size_t value_kind_size = sizeof(res); - CUPTI_CALL(cuptiMetricGetAttribute( - metric, CUPTI_METRIC_ATTR_VALUE_KIND, &value_kind_size, &res)); - return res; -} - -CUpti_MetricEvaluationMode CuptiMetricApi::evaluationMode( - CUpti_MetricID metric) { - CUpti_MetricEvaluationMode eval_mode{ - CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE}; - size_t eval_mode_size = sizeof(eval_mode); - CUPTI_CALL(cuptiMetricGetAttribute( - metric, CUPTI_METRIC_ATTR_EVALUATION_MODE, &eval_mode_size, &eval_mode)); - return eval_mode; -} - -// FIXME: Consider caching value kind here -SampleValue CuptiMetricApi::calculate( - CUpti_MetricID metric, - CUpti_MetricValueKind kind, - vector& events, - vector& values, - int64_t duration) { - CUpti_MetricValue metric_value; - CUPTI_CALL(cuptiMetricGetValue( - device_, - metric, - events.size() * sizeof(CUpti_EventID), - events.data(), - values.size() * sizeof(int64_t), - reinterpret_cast(values.data()), - duration, - &metric_value)); - - switch (kind) { - case CUPTI_METRIC_VALUE_KIND_DOUBLE: - case CUPTI_METRIC_VALUE_KIND_PERCENT: - return SampleValue(metric_value.metricValueDouble); - case CUPTI_METRIC_VALUE_KIND_UINT64: - case CUPTI_METRIC_VALUE_KIND_INT64: - case CUPTI_METRIC_VALUE_KIND_THROUGHPUT: - return SampleValue(metric_value.metricValueUint64); - case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL: - return SampleValue((int)metric_value.metricValueUtilizationLevel); - default: - assert(false); - } - return SampleValue(-1); -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.h b/plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.h deleted file mode 100644 index f45d38cd616..00000000000 --- 
a/plugins/tensorboard-plugins/libkineto/src/CuptiMetricApi.h +++ /dev/null @@ -1,38 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -#include -#include - -#include "SampleListener.h" - -namespace KINETO_NAMESPACE { - -// C++ interface to CUPTI Metrics C API. -// Virtual methods are here mainly to allow easier testing. -class CuptiMetricApi { - public: - explicit CuptiMetricApi(CUdevice device) : device_(device) {} - virtual ~CuptiMetricApi() {} - - virtual CUpti_MetricID idFromName(const std::string& name); - virtual std::map events(CUpti_MetricID metric_id); - - virtual CUpti_MetricValueKind valueKind(CUpti_MetricID metric); - virtual CUpti_MetricEvaluationMode evaluationMode(CUpti_MetricID metric); - - virtual SampleValue calculate( - CUpti_MetricID metric, - CUpti_MetricValueKind kind, - std::vector& events, - std::vector& values, - int64_t duration); - - private: - CUdevice device_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.cpp deleted file mode 100644 index d1b08ab2c13..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.cpp +++ /dev/null @@ -1,504 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#ifdef HAS_CUPTI -#include -#if defined(CUDART_VERSION) && CUDART_VERSION > 10000 && CUDART_VERSION < 11040 -#include -#include -#include -#endif // cuda version > 10.00 and < 11.04 -#endif // HAS_CUPTI - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ScopeExit.h" -#include "CuptiNvPerfMetric.h" -#include "Logger.h" - -namespace KINETO_NAMESPACE { - -// Add a namespace to isolate these utility functions that are only -// going to be used by the CuptiRangeProfiler. These included calls -// to NVIDIA PerfWorks APIs. 
-namespace nvperf { - - -// Largely based on NVIDIA sample code provided with CUDA release -// files Metric.cpp and Eval.cpp - -// ------------------------------------------------- -// Metric and Counter Data Configuration -// ------------------------------------------------- - - -// Note: Be carful before modifying the code below. There is a specific -// sequence one needs to follow to program the metrics else things may -// stop working. We tried to keep the flow consistent with the example -// code from NVIDIA. Since most of the programmability comes from -// the CUPTI profiler metric names this should be okay. - -// Only supported on CUDA RT Version between 10.0 and 11.04. -// After CUDA RT 11.04, the structure has changed. -// TODO update the structure NVPA_RawMetricsConfig to support 11.04 -#if defined(CUDART_VERSION) && CUDART_VERSION > 10000 && CUDART_VERSION < 11040 - -bool getRawMetricRequests( - NVPA_MetricsContext* metricsContext, - std::vector metricNames, - std::vector& rawMetricsDeps, - std::vector& rawMetricRequests) { - bool isolated = true; - /* Bug in collection with collection of metrics without instances, keep it - * to true*/ - bool keepInstances = true; - - for (const auto& metricName : metricNames) { - - NVPW_MetricsContext_GetMetricProperties_Begin_Params - getMetricPropertiesBeginParams = { - NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE, nullptr}; - getMetricPropertiesBeginParams.pMetricsContext = metricsContext; - getMetricPropertiesBeginParams.pMetricName = metricName.c_str(); - - if (!NVPW_CALL( - NVPW_MetricsContext_GetMetricProperties_Begin( - &getMetricPropertiesBeginParams))) { - return false; - } - - for (const char** metricDepsIt = - getMetricPropertiesBeginParams.ppRawMetricDependencies; - *metricDepsIt; - ++metricDepsIt) { - rawMetricsDeps.push_back(*metricDepsIt); - } - - NVPW_MetricsContext_GetMetricProperties_End_Params - getMetricPropertiesEndParams = { - 
NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE, nullptr}; - getMetricPropertiesEndParams.pMetricsContext = metricsContext; - - if (!NVPW_CALL(NVPW_MetricsContext_GetMetricProperties_End( - &getMetricPropertiesEndParams))) { - return false; - } - } - - for (const auto& rawMetricName : rawMetricsDeps) { - NVPA_RawMetricRequest metricRequest = {NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE, nullptr}; - metricRequest.pMetricName = rawMetricName.c_str(); - metricRequest.isolated = isolated; - metricRequest.keepInstances = keepInstances; - rawMetricRequests.push_back(metricRequest); - VLOG(1) << "Adding raw metric struct : raw metric = " << rawMetricName - << " isolated = " << isolated << " keepinst = " << keepInstances; - } - - if (rawMetricRequests.size() == 0) { - LOG(WARNING) << "CUPTI Profiler was unable to configure any metrics"; - return false; - } - return true; -} - -// Setup CUPTI Profiler Config Image -bool getProfilerConfigImage( - const std::string& chipName, - const std::vector& metricNames, - std::vector& configImage, - const uint8_t* counterAvailabilityImage) { - - NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { - NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr}; - metricsContextCreateParams.pChipName = chipName.c_str(); - - if (!NVPW_CALL( - NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { - return false; - } - - NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { - NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, nullptr}; - metricsContextDestroyParams.pMetricsContext = - metricsContextCreateParams.pMetricsContext; - - SCOPE_EXIT([&]() { - NVPW_MetricsContext_Destroy( - (NVPW_MetricsContext_Destroy_Params*)&metricsContextDestroyParams); - }); - - // Get all raw metrics required for given metricNames list - std::vector rawMetricRequests; - - // note: we need a variable at this functions scope to hold the string - // pointers for underlying C char arrays. 
- std::vector rawMetricDeps; - - if (!getRawMetricRequests( - metricsContextCreateParams.pMetricsContext, - metricNames, - rawMetricDeps, - rawMetricRequests)) { - return false; - } - - NVPA_RawMetricsConfigOptions metricsConfigOptions = { - NVPA_RAW_METRICS_CONFIG_OPTIONS_STRUCT_SIZE, nullptr}; - metricsConfigOptions.activityKind = NVPA_ACTIVITY_KIND_PROFILER; - metricsConfigOptions.pChipName = chipName.c_str(); - NVPA_RawMetricsConfig* rawMetricsConfig; - if (!NVPW_CALL( - NVPA_RawMetricsConfig_Create( - &metricsConfigOptions, &rawMetricsConfig))) { - return false; - } - - // TODO check if this is required - if (counterAvailabilityImage) { - NVPW_RawMetricsConfig_SetCounterAvailability_Params - setCounterAvailabilityParams = { - NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE, nullptr}; - setCounterAvailabilityParams.pRawMetricsConfig = rawMetricsConfig; - setCounterAvailabilityParams.pCounterAvailabilityImage = - counterAvailabilityImage; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_SetCounterAvailability( - &setCounterAvailabilityParams))) { - return false; - } - } - - NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = { - NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE, nullptr}; - rawMetricsConfigDestroyParams.pRawMetricsConfig = rawMetricsConfig; - SCOPE_EXIT([&]() { - NVPW_RawMetricsConfig_Destroy( - (NVPW_RawMetricsConfig_Destroy_Params*)&rawMetricsConfigDestroyParams); - }); - - // Start a Raw Metric Pass group - NVPW_RawMetricsConfig_BeginPassGroup_Params beginPassGroupParams = { - NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE, nullptr}; - beginPassGroupParams.pRawMetricsConfig = rawMetricsConfig; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_BeginPassGroup(&beginPassGroupParams))) { - return false; - } - - // Add all raw metrics - NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = { - NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE, nullptr}; - addMetricsParams.pRawMetricsConfig = rawMetricsConfig; - 
addMetricsParams.pRawMetricRequests = rawMetricRequests.data(); - addMetricsParams.numMetricRequests = rawMetricRequests.size(); - if (!NVPW_CALL( - NVPW_RawMetricsConfig_AddMetrics(&addMetricsParams))) { - return false; - } - - // End pass group - NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = { - NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE, nullptr}; - endPassGroupParams.pRawMetricsConfig = rawMetricsConfig; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_EndPassGroup(&endPassGroupParams))) { - return false; - } - - // Setup Config Image generation - NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = { - NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE, nullptr}; - generateConfigImageParams.pRawMetricsConfig = rawMetricsConfig; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_GenerateConfigImage(&generateConfigImageParams))) { - return false; - } - - // Get the Config Image size... nearly there - NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = { - NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE, nullptr}; - getConfigImageParams.pRawMetricsConfig = rawMetricsConfig; - getConfigImageParams.bytesAllocated = 0; - getConfigImageParams.pBuffer = nullptr; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { - return false; - } - - configImage.resize(getConfigImageParams.bytesCopied); - - // Write the Config image binary - getConfigImageParams.bytesAllocated = configImage.size(); - getConfigImageParams.pBuffer = configImage.data(); - if (!NVPW_CALL( - NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { - return false; - } - - return true; -} - -bool getCounterDataPrefixImage( - const std::string& chipName, - const std::vector& metricNames, - std::vector& counterDataImagePrefix) { - - NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { - NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr}; - 
metricsContextCreateParams.pChipName = chipName.c_str(); - - if (!NVPW_CALL( - NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { - return false; - } - - NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { - NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, nullptr}; - metricsContextDestroyParams.pMetricsContext = - metricsContextCreateParams.pMetricsContext; - - - SCOPE_EXIT([&]() { - NVPW_MetricsContext_Destroy( - (NVPW_MetricsContext_Destroy_Params*)&metricsContextDestroyParams); - }); - - // Get all raw metrics required for given metricNames list - std::vector rawMetricRequests; - - // note: we need a variable at this functions scope to hold the string - // pointers for underlying C char arrays. - std::vector rawMetricDeps; - - if (!getRawMetricRequests( - metricsContextCreateParams.pMetricsContext, - metricNames, - rawMetricDeps, - rawMetricRequests)) { - return false; - } - - // Setup Counter Data builder - NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = { - NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE, nullptr}; - counterDataBuilderCreateParams.pChipName = chipName.c_str(); - if (!NVPW_CALL( - NVPW_CounterDataBuilder_Create(&counterDataBuilderCreateParams))) { - return false; - } - - NVPW_CounterDataBuilder_Destroy_Params counterDataBuilderDestroyParams = { - NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE, nullptr}; - counterDataBuilderDestroyParams.pCounterDataBuilder = - counterDataBuilderCreateParams.pCounterDataBuilder; - SCOPE_EXIT([&]() { - NVPW_CounterDataBuilder_Destroy(( - NVPW_CounterDataBuilder_Destroy_Params*)&counterDataBuilderDestroyParams); - }); - - // Add metrics to counter data image prefix - NVPW_CounterDataBuilder_AddMetrics_Params addMetricsParams = { - NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE, nullptr}; - addMetricsParams.pCounterDataBuilder = - counterDataBuilderCreateParams.pCounterDataBuilder; - addMetricsParams.pRawMetricRequests = rawMetricRequests.data(); - 
addMetricsParams.numMetricRequests = rawMetricRequests.size(); - if (!NVPW_CALL( - NVPW_CounterDataBuilder_AddMetrics(&addMetricsParams))) { - return false; - } - - // Get image prefix size - NVPW_CounterDataBuilder_GetCounterDataPrefix_Params - getCounterDataPrefixParams = { - NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE, nullptr}; - getCounterDataPrefixParams.pCounterDataBuilder = - counterDataBuilderCreateParams.pCounterDataBuilder; - getCounterDataPrefixParams.bytesAllocated = 0; - getCounterDataPrefixParams.pBuffer = nullptr; - if (!NVPW_CALL( - NVPW_CounterDataBuilder_GetCounterDataPrefix( - &getCounterDataPrefixParams))) { - return false; - } - - counterDataImagePrefix.resize(getCounterDataPrefixParams.bytesCopied); - - // Now write counter data image prefix - getCounterDataPrefixParams.bytesAllocated = counterDataImagePrefix.size(); - getCounterDataPrefixParams.pBuffer = counterDataImagePrefix.data(); - if (!NVPW_CALL( - NVPW_CounterDataBuilder_GetCounterDataPrefix( - &getCounterDataPrefixParams))) { - return false; - } - - return true; -} - -// ------------------------------------------------- -// Metric and Counter Evaluation Utilities -// ------------------------------------------------- - -std::string getRangeDescription( - const std::vector& counterDataImage, - int rangeIndex) { - std::vector descriptionPtrs; - - NVPW_Profiler_CounterData_GetRangeDescriptions_Params getRangeDescParams = { - NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE, nullptr}; - getRangeDescParams.pCounterDataImage = counterDataImage.data(); - getRangeDescParams.rangeIndex = rangeIndex; - - if (!NVPW_CALL( - NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams))) { - return ""; - } - - descriptionPtrs.resize(getRangeDescParams.numDescriptions); - getRangeDescParams.ppDescriptions = descriptionPtrs.data(); - - if (!NVPW_CALL( - NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams))) { - return ""; - } - - std::string 
rangeName; - - for (size_t i = 0; i < getRangeDescParams.numDescriptions; i++) { - if (i > 0) { - rangeName.append("/"); - } - rangeName.append(descriptionPtrs[i]); - } - return rangeName; -} - -CuptiProfilerResult evalMetricValues( - const std::string& chipName, - const std::vector& counterDataImage, - const std::vector& metricNames, - bool verbose) { - - if (!counterDataImage.size()) { - LOG(ERROR) << "Counter Data Image is empty!"; - return {}; - } - - NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { - NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr}; - metricsContextCreateParams.pChipName = chipName.c_str(); - if (!NVPW_CALL( - NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { - return {}; - } - - NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { - NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, nullptr}; - metricsContextDestroyParams.pMetricsContext = - metricsContextCreateParams.pMetricsContext; - SCOPE_EXIT([&]() { - NVPW_MetricsContext_Destroy( - (NVPW_MetricsContext_Destroy_Params*)&metricsContextDestroyParams); - }); - - NVPW_CounterData_GetNumRanges_Params getNumRangesParams = { - NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE, nullptr}; - getNumRangesParams.pCounterDataImage = counterDataImage.data(); - if (!NVPW_CALL( - NVPW_CounterData_GetNumRanges(&getNumRangesParams))) { - return {}; - } - - // TBD in the future support special chars in metric name - // for now these are default - const bool isolated = true; - - // API takes a 2D array of chars - std::vector metricNamePtrs; - - for (const auto& metric : metricNames) { - metricNamePtrs.push_back(metric.c_str()); - } - - CuptiProfilerResult result{ - .metricNames = metricNames}; - - for (size_t rangeIndex = 0; rangeIndex < getNumRangesParams.numRanges; - ++rangeIndex) { - - CuptiRangeMeasurement rangeData { - .rangeName = getRangeDescription(counterDataImage, rangeIndex)}; - rangeData.values.resize(metricNames.size()); - - // First set 
Counter data image with current range - NVPW_MetricsContext_SetCounterData_Params setCounterDataParams = { - NVPW_MetricsContext_SetCounterData_Params_STRUCT_SIZE, nullptr}; - - setCounterDataParams.pMetricsContext = - metricsContextCreateParams.pMetricsContext; - setCounterDataParams.pCounterDataImage = counterDataImage.data(); - setCounterDataParams.isolated = isolated; - setCounterDataParams.rangeIndex = rangeIndex; - - NVPW_CALL(NVPW_MetricsContext_SetCounterData(&setCounterDataParams)); - - - // Now we can evaluate GPU metrics - NVPW_MetricsContext_EvaluateToGpuValues_Params evalToGpuParams = { - NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE, nullptr}; - evalToGpuParams.pMetricsContext = - metricsContextCreateParams.pMetricsContext; - evalToGpuParams.numMetrics = metricNamePtrs.size(); - evalToGpuParams.ppMetricNames = metricNamePtrs.data(); - evalToGpuParams.pMetricValues = rangeData.values.data(); - - if (!NVPW_CALL(NVPW_MetricsContext_EvaluateToGpuValues(&evalToGpuParams))) { - LOG(WARNING) << "Failed to evaluate metris for range : " - << rangeData.rangeName; - continue; - } - - if (verbose) { - for (size_t i = 0; i < metricNames.size(); i++) { - LOG(INFO) << "rangeName: " << rangeData.rangeName - << "\tmetricName: " << metricNames[i] - << "\tgpuValue: " << rangeData.values[i]; - } - } - - result.rangeVals.emplace_back(std::move(rangeData)); - } - - return result; -} - -#else - -bool getProfilerConfigImage( - const std::string& /*chipName*/, - const std::vector& /*metricNames*/, - std::vector& /*configImage*/, - const uint8_t* /*counterAvailabilityImage*/) { - return false; -} - -bool getCounterDataPrefixImage( - const std::string& /*chipName*/, - const std::vector& /*metricNames*/, - std::vector& /*counterDataImagePrefix*/) { - return false; -} - -CuptiProfilerResult evalMetricValues( - const std::string& /*chipName*/, - const std::vector& /*counterDataImage*/, - const std::vector& /*metricNames*/, - bool /*verbose*/) { - return {}; -} - 
-#endif // cuda version > 10.00 and < 11.04 - -} // namespace nvperf -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.h b/plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.h deleted file mode 100644 index d5dd1b1c1d2..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiNvPerfMetric.h +++ /dev/null @@ -1,71 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "Logger.h" - -namespace KINETO_NAMESPACE { - -struct CuptiRangeMeasurement { - std::string rangeName; - std::vector values; -}; - -struct CuptiProfilerResult { - std::vector metricNames; - // rangeName, list values - std::vector rangeVals; -}; - -/* Utilities for CUPTI and NVIDIA PerfWorks Metric API - */ - -#define NVPW_CALL(call) \ - [&]() -> bool { \ - NVPA_Status _status_ = call; \ - if (_status_ != NVPA_STATUS_SUCCESS) { \ - LOG(WARNING) << fmt::format( \ - "function {} failed with error ({})", \ - #call, \ - (int)_status_); \ - return false; \ - } \ - return true; \ - }() - -// fixme - add a results string -// nvpperfGetResultString(_status_, &_errstr_); - -namespace nvperf { - -// Setup CUPTI profiler configuration blob and counter data image prefix -bool getProfilerConfigImage( - const std::string& chipName, - const std::vector& metricNames, - std::vector& configImage, - const uint8_t* counterAvailabilityImage = nullptr); - -// Setup CUPTI profiler configuration blob and counter data image prefix -bool getCounterDataPrefixImage( - const std::string& chipName, - const std::vector& metricNames, - std::vector& counterDataImagePrefix); - -/* NV Perf Metric Evaluation helpers - * - utilities to read binary data and obtain metrics for ranges - */ -CuptiProfilerResult evalMetricValues( - const std::string& chipName, - const std::vector& counterDataImage, - const 
std::vector& metricNames, - bool verbose = false); - - -} // namespace nvperf -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.cpp deleted file mode 100644 index e5f18ed7b0b..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.cpp +++ /dev/null @@ -1,751 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include -#ifdef HAS_CUPTI -#include -#include -#endif // HAS_CUPTI -#include -#include - -#ifdef HAS_CUPTI -#include "cupti_call.h" -#endif - -#include "time_since_epoch.h" -#include "Logger.h" -#include "Demangle.h" - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "CuptiCallbackApiMock.h" -#include "CuptiRangeProfilerApi.h" - -#if HAS_CUPTI_RANGE_PROFILER -#include -#include -#include "cupti_call.h" -#endif // HAS_CUPTI_RANGE_PROFILER - -namespace KINETO_NAMESPACE { - -#if HAS_CUPTI_RANGE_PROFILER -constexpr char kRootUserRangeName[] = "__profile__"; -constexpr int kCallbacksCountToFlush = 500; - -// Should we set Counter availability image ourselves? 
-// Disabled this right now as this call conflicts with DCGM -// It is not clear why it should conflict except it being a profiler API call -// TODO Revisit -constexpr bool kSetCounterAvail = false; - -// Shared state to track one Cupti Profiler API per Device -namespace { -// per device profiler maps -std::unordered_map profiler_map; -std::unordered_map enable_flag; -std::unordered_map disable_flag; - -std::mutex contextMutex_; -std::unordered_map ctx_to_dev; -std::set active_devices; -} - -// forward declarations -void __trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid); -void __trackCudaKernelLaunch(CUcontext ctx, const char* kernelName); - -/// Helper functions - -// Available raw counters -std::vector getCounterAvailiability(CUcontext cuContext) { - std::vector counterAvailabilityImage; - CUpti_Profiler_GetCounterAvailability_Params getCounterAvailabilityParams = { - CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE, nullptr}; - getCounterAvailabilityParams.ctx = cuContext; - CUPTI_CALL( - cuptiProfilerGetCounterAvailability(&getCounterAvailabilityParams)); - - counterAvailabilityImage.clear(); - counterAvailabilityImage.resize( - getCounterAvailabilityParams.counterAvailabilityImageSize); - - getCounterAvailabilityParams.pCounterAvailabilityImage = - counterAvailabilityImage.data(); - CUPTI_CALL( - cuptiProfilerGetCounterAvailability(&getCounterAvailabilityParams)); - - return counterAvailabilityImage; -} - -std::string getChipName(int deviceId) { - // Get chip name for the cuda device - CUpti_Device_GetChipName_Params getChipNameParams = { - CUpti_Device_GetChipName_Params_STRUCT_SIZE, nullptr}; - - getChipNameParams.deviceIndex = deviceId; - CUPTI_CALL(cuptiDeviceGetChipName(&getChipNameParams)); - - return getChipNameParams.pChipName; -} - -inline uint32_t getDevID(CUcontext ctx) { - uint32_t device_id = UINT32_MAX; - CUPTI_CALL(cuptiGetDeviceId(ctx, &device_id)); - if (device_id == UINT32_MAX) { - LOG(ERROR) << "Could not 
determine dev id for = " << ctx; - } - return device_id; -} - -// We use CUPTI Callback functions in three ways : -// 1. Track cuda contexts and maintain a list of active GPUs to profile -// 2. Callbacks on kernel launches to track the name of automatic -// ranges that correspond to names of kernels -// 3. Lastly CUPTI profiler has to be enabled on the same thread executing -// the CUDA kernels. We use Callbacks to enable the profiler -// asynchronously from another thread. - -void disableKernelCallbacks(); - -void trackCudaCtx( - CUpti_CallbackDomain /*domain*/, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo) { - auto *d = reinterpret_cast(cbInfo); - auto ctx = d->context; - uint32_t device_id = getDevID(ctx); - - if (device_id == UINT32_MAX) { - return; - } - - __trackCudaCtx(ctx, device_id, cbid); -} - -void __trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid) { - std::lock_guard g(contextMutex_); - if (cbid == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) { - VLOG(0) << "CUPTI Profiler observed CUDA Context created = " - << ctx << " device id = " << device_id; - active_devices.insert(device_id); - if constexpr (kSetCounterAvail) { - if (active_devices.size() == 1) { - CuptiRBProfilerSession::setCounterAvailabilityImage( - getCounterAvailiability(ctx)); - } - } - ctx_to_dev[ctx] = device_id; - - } else if (cbid == CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) { - VLOG(0) << "CUPTI Profiler observed CUDA Context destroyed = " - << ctx << " device id = " << device_id; - auto it = active_devices.find(device_id); - if (it != active_devices.end()) { - active_devices.erase(it); - ctx_to_dev.erase(ctx); - } - } -} - -void trackCudaKernelLaunch( - CUpti_CallbackDomain /*domain*/, - CUpti_CallbackId /*cbid*/, - const CUpti_CallbackData* cbInfo) { - VLOG(1) << " Trace : Callback name = " - << (cbInfo->symbolName ? 
cbInfo->symbolName: "") - << " context ptr = " << cbInfo->context; - auto ctx = cbInfo->context; - // should be in CUPTI_API_ENTER call site - if (cbInfo->callbackSite != CUPTI_API_ENTER) { - return; - } - __trackCudaKernelLaunch(ctx, cbInfo->symbolName); -} - -void __trackCudaKernelLaunch( - CUcontext ctx, - const char* kernelName) { - VLOG(0) << " Tracking kernel name = " << (kernelName ? kernelName : "") - << " context ptr = " << ctx; - - uint32_t device_id = 0; - auto it = ctx_to_dev.find(ctx); - if (it == ctx_to_dev.end()) { - // Warning here could be too noisy - VLOG(0) << " Could not find corresponding device to ctx = " << ctx; - return; - } else { - device_id = it->second; - } - - auto pit = profiler_map.find(device_id); - if (pit == profiler_map.end() || pit->second == nullptr) { - return; - } - auto profiler = pit->second; - - if (enable_flag[device_id]) { - LOG(INFO) << "Callback handler is enabling cupti profiler"; - profiler->startAndEnable(); - enable_flag[device_id] = false; - - } else if (disable_flag[device_id]) { - LOG(INFO) << "Callback handler is disabling cupti profiler"; - profiler->disableAndStop(); - return; - } - - if (profiler->curRange_ == CUPTI_AutoRange) { - profiler->logKernelName(kernelName ? 
kernelName : "__missing__"); - } - - /* TODO add per kernel time logging - if (measure_per_kernel) { - profiler->kernelStartTs_.push_back( - std::chrono::high_resolution_clock::now()); - } - */ - - // periodically flush profiler data from GPU - if (profiler->numCallbacks_ % kCallbacksCountToFlush == 0) { - profiler->flushCounterData(); - } - profiler->numCallbacks_++; -} - -void enableKernelCallbacks() { - auto& cbapi = CuptiCallbackApi::singleton(); - bool status = cbapi.enableCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); - if (!status) { - LOG(WARNING) << "CUPTI Range Profiler unable to " - << "enable cuda kernel launch callback"; - return; - } - LOG(INFO) << "CUPTI Profiler kernel callbacks enabled"; -} - -void disableKernelCallbacks() { - auto& cbapi = CuptiCallbackApi::singleton(); - bool status = cbapi.disableCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); - if (!status) { - LOG(WARNING) << "CUPTI Range Profiler unable to " - << "disable cuda kernel launch callback"; - return; - } - LOG(INFO) << "CUPTI Profiler kernel callbacks disabled"; -} - -// static -std::set CuptiRBProfilerSession::getActiveDevices() { - std::lock_guard g(contextMutex_); - return active_devices; -} - -// static -void CuptiRBProfilerSession::initCupti() { - CUpti_Profiler_Initialize_Params profilerInitializeParams = { - CUpti_Profiler_Initialize_Params_STRUCT_SIZE, nullptr}; - CUPTI_CALL(cuptiProfilerInitialize(&profilerInitializeParams)); -} - -// static -void CuptiRBProfilerSession::deInitCupti() { - CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = { - CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE, nullptr}; - CUPTI_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams)); -} - -// static -void CuptiRBProfilerSession::staticInit() { - CuptiRBProfilerSession::initCupti(); - - // Register CUPTI callbacks - auto& cbapi = CuptiCallbackApi::singleton(); - CUpti_CallbackDomain 
domain = CUPTI_CB_DOMAIN_RESOURCE; - bool status = cbapi.registerCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, trackCudaCtx); - status = status && cbapi.registerCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, trackCudaCtx); - status = status && cbapi.enableCallback( - domain, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); - status = status && cbapi.enableCallback( - domain, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); - - if (!status) { - LOG(WARNING) << "CUPTI Range Profiler unable to attach cuda context " - << "create and destroy callbacks"; - CUPTI_CALL(cbapi.getCuptiStatus()); - return; - } - - domain = CUPTI_CB_DOMAIN_RUNTIME_API; - status = cbapi.registerCallback( - domain, CuptiCallbackApi::CUDA_LAUNCH_KERNEL, trackCudaKernelLaunch); - - if (!status) { - LOG(WARNING) << "CUPTI Range Profiler unable to attach cuda kernel " - << "launch callback"; - return; - } -} - -// static -std::vector& CuptiRBProfilerSession::counterAvailabilityImage() { - static std::vector counterAvailabilityImage_; - return counterAvailabilityImage_; -} - - -// Setup the profiler sessions -CuptiRBProfilerSession::CuptiRBProfilerSession( - const std::vector& metricNames, - int deviceId, - int maxRanges, - int numNestingLevels, - CUcontext cuContext) - : metricNames_(metricNames), - chipName_(getChipName(deviceId)), - deviceId_(deviceId), - maxRanges_(maxRanges), - numNestingLevels_(numNestingLevels), - cuContext_(cuContext) { - CuptiRBProfilerSession::initCupti(); - - LOG(INFO) << "Initializing CUPTI profiler session : device = " << deviceId - << " chip = " << chipName_; - /* Generate configuration for metrics, this can also be done offline*/ - NVPW_InitializeHost_Params initializeHostParams = { - NVPW_InitializeHost_Params_STRUCT_SIZE, nullptr}; - NVPW_CALL(NVPW_InitializeHost(&initializeHostParams)); - - if (metricNames.size()) { - if (!nvperf::getProfilerConfigImage( - chipName_, - metricNames, - configImage, - 
CuptiRBProfilerSession::counterAvailabilityImage().data())) { - LOG(ERROR) << "Failed to create configImage or counterDataImagePrefix"; - return; - } - if (!nvperf::getCounterDataPrefixImage( - chipName_, - metricNames, - counterDataImagePrefix)) { - LOG(ERROR) << "Failed to create counterDataImagePrefix"; - return; - } - } else { - LOG(ERROR) << "No metrics provided to profile"; - return; - } - - if (!createCounterDataImage()) { - LOG(ERROR) << "Failed to create counterDataImage"; - return; - } - - LOG(INFO) << "Size of structs\n" - << " config image size = " << configImage.size() << " B" - << " counter data image prefix = " - << counterDataImagePrefix.size() << " B" - << " counter data image size = " << counterDataImage.size() / 1024 - << " KB" - << " counter sb image size = " - << counterDataScratchBuffer.size() << " B"; - - beginPassParams_ = {CUpti_Profiler_BeginPass_Params_STRUCT_SIZE, nullptr}; - endPassParams_ = {CUpti_Profiler_EndPass_Params_STRUCT_SIZE, nullptr}; - - initSuccess_ = true; - profiler_map[deviceId] = this; -} - -// used in unittests only -CuptiRBProfilerSession::CuptiRBProfilerSession(int deviceId, CUcontext ctx) - : deviceId_(deviceId), cuContext_(ctx) { - initSuccess_ = true; - profiler_map[deviceId] = this; -} - -void CuptiRBProfilerSession::startInternal( - CUpti_ProfilerRange profilerRange, - CUpti_ProfilerReplayMode profilerReplayMode) { - LOG(INFO) << "Starting profiler session: profiler range = " - << ((profilerRange == CUPTI_AutoRange) ? "autorange" : "userrange") - << " replay mode = " - << ((profilerReplayMode == CUPTI_KernelReplay) ? 
"kernel" : "user"); - if (!initSuccess_) { - LOG(WARNING) << __func__ << "() bailing out since initialization failed"; - return; - } - - if (cuContext_ == nullptr) { - for (const auto& it : ctx_to_dev) { - if (it.second == deviceId_) { - cuContext_ = it.first; - break; - } - } - LOG(INFO) << " Cupti Profiler using CUDA context = " << cuContext_; - } - - profilerStartTs_ = std::chrono::high_resolution_clock::now(); - curRange_ = profilerRange; - curReplay_ = profilerReplayMode; - - CUpti_Profiler_BeginSession_Params beginSessionParams = { - CUpti_Profiler_BeginSession_Params_STRUCT_SIZE, nullptr}; - - beginSessionParams.ctx = cuContext_; - beginSessionParams.counterDataImageSize = counterDataImage.size(); - beginSessionParams.pCounterDataImage = counterDataImage.data(); - beginSessionParams.counterDataScratchBufferSize = - counterDataScratchBuffer.size(); - beginSessionParams.pCounterDataScratchBuffer = counterDataScratchBuffer.data(); - beginSessionParams.range = profilerRange; - beginSessionParams.replayMode = profilerReplayMode; - beginSessionParams.maxRangesPerPass = maxRanges_; - beginSessionParams.maxLaunchesPerPass = maxRanges_; - - auto status = CUPTI_CALL(cuptiProfilerBeginSession(&beginSessionParams)); - if (status != CUPTI_SUCCESS) { - LOG(WARNING) << "Failed to start CUPTI profiler"; - initSuccess_ = false; - return; - } - - // Set counter configuration - CUpti_Profiler_SetConfig_Params setConfigParams = { - CUpti_Profiler_SetConfig_Params_STRUCT_SIZE, nullptr}; - - setConfigParams.ctx = cuContext_; - setConfigParams.pConfig = configImage.data(); - setConfigParams.configSize = configImage.size(); - setConfigParams.passIndex = 0; - setConfigParams.minNestingLevel = 1; - setConfigParams.numNestingLevels = numNestingLevels_; - status = CUPTI_CALL(cuptiProfilerSetConfig(&setConfigParams)); - - if (status != CUPTI_SUCCESS) { - LOG(WARNING) << "Failed to configure CUPTI profiler"; - initSuccess_ = false; - return; - } - profilerInitDoneTs_ = 
std::chrono::high_resolution_clock::now(); - - if (curRange_ == CUPTI_AutoRange) { - enableKernelCallbacks(); - } - profilingActive_ = true; -} - -void CuptiRBProfilerSession::stop() { - if (!initSuccess_) { - LOG(WARNING) << __func__ << "() bailing out since initialization failed"; - return; - } - LOG(INFO) << "Stop profiler session on device = " << deviceId_; - - CUpti_Profiler_UnsetConfig_Params unsetConfigParams = { - CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE, nullptr}; - CUPTI_CALL(cuptiProfilerUnsetConfig(&unsetConfigParams)); - - CUpti_Profiler_EndSession_Params endSessionParams = { - CUpti_Profiler_EndSession_Params_STRUCT_SIZE, nullptr}; - CUPTI_CALL(cuptiProfilerEndSession(&endSessionParams)); - - disableKernelCallbacks(); - - profilerStopTs_ = std::chrono::high_resolution_clock::now(); - profilingActive_ = false; -} - -void CuptiRBProfilerSession::beginPass() { - if (!initSuccess_) { - LOG(WARNING) << __func__ << "() bailing out since initialization failed"; - return; - } - CUPTI_CALL(cuptiProfilerBeginPass(&beginPassParams_)); -} - -bool CuptiRBProfilerSession::endPass() { - if (!initSuccess_) { - LOG(WARNING) << __func__ << "() bailing out since initialization failed"; - return true; - } - CUPTI_CALL(cuptiProfilerEndPass(&endPassParams_)); - return endPassParams_.allPassesSubmitted; -} - -void CuptiRBProfilerSession::flushCounterData() { - LOG(INFO) << "Flushing counter data on device = " << deviceId_; - CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = { - CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE, nullptr}; - CUPTI_CALL(cuptiProfilerFlushCounterData(&flushCounterDataParams)); -} - -/// Enable and disable the profiler -void CuptiRBProfilerSession::enable() { - if (!initSuccess_) { - LOG(WARNING) << __func__ << "() bailing out since initialization failed"; - return; - } - CUpti_Profiler_EnableProfiling_Params enableProfilingParams = { - CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE, nullptr}; - 
CUPTI_CALL(cuptiProfilerEnableProfiling(&enableProfilingParams)); -} - -void CuptiRBProfilerSession::disable() { - if (!initSuccess_) { - LOG(WARNING) << __func__ << "() bailing out since initialization failed"; - return; - } - CUpti_Profiler_DisableProfiling_Params disableProfilingParams = { - CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE, nullptr}; - CUPTI_CALL(cuptiProfilerDisableProfiling(&disableProfilingParams)); -} - -/// User range based profiling -void CuptiRBProfilerSession::pushRange(const std::string& rangeName) { - LOG(INFO) << " CUPTI pushrange ( " << rangeName << " )"; - CUpti_Profiler_PushRange_Params pushRangeParams = { - CUpti_Profiler_PushRange_Params_STRUCT_SIZE, nullptr}; - pushRangeParams.pRangeName = rangeName.c_str(); - CUPTI_CALL(cuptiProfilerPushRange(&pushRangeParams)); -} - -void CuptiRBProfilerSession::popRange() { - LOG(INFO) << " CUPTI pop range"; - CUpti_Profiler_PopRange_Params popRangeParams = { - CUpti_Profiler_PopRange_Params_STRUCT_SIZE, nullptr}; - CUPTI_CALL(cuptiProfilerPopRange(&popRangeParams)); -} - -void CuptiRBProfilerSession::startAndEnable() { - startInternal(curRange_, curReplay_); - if (curReplay_ == CUPTI_UserReplay) { - beginPass(); - } - enable(); - if (curRange_ == CUPTI_UserRange) { - pushRange(kRootUserRangeName); - } - enable_flag[deviceId_] = false; -} - -void CuptiRBProfilerSession::disableAndStop() { - if (curRange_ == CUPTI_UserRange) { - popRange(); - } - disable(); - if (curReplay_ == CUPTI_UserReplay) { - endPass(); - flushCounterData(); - } - stop(); - disable_flag[deviceId_] = false; -} - -void CuptiRBProfilerSession::asyncStartAndEnable( - CUpti_ProfilerRange profilerRange, - CUpti_ProfilerReplayMode profilerReplayMode) { - LOG(INFO) << "Starting CUPTI profiler asynchronously on device = " - << deviceId_ << " profiler range = " - << ((profilerRange == CUPTI_AutoRange) ? "autorange" : "userrange") - << " replay mode = " - << ((profilerReplayMode == CUPTI_KernelReplay) ? 
"kernel" : "user"); - curReplay_ = profilerReplayMode; - curRange_ = profilerRange; - enable_flag[deviceId_] = true; - enableKernelCallbacks(); -} - -void CuptiRBProfilerSession::asyncDisableAndStop() { - LOG(INFO) << "Stopping CUPTI profiler asynchronously on device = " - << deviceId_ << " cu context = " << cuContext_; - disable_flag[deviceId_] = true; -} - - -CuptiProfilerResult CuptiRBProfilerSession::evaluateMetrics( - bool verbose) { - if (!initSuccess_) { - LOG(WARNING) << "Profiling failed, no results to return"; - return {}; - } - if (profilingActive_) { - disableAndStop(); - } - - LOG(INFO) << "Total kernels logged = " << kernelNames_.size(); - if (verbose) { - for (const auto& kernel : kernelNames_) { - std::cout << demangle(kernel) << std::endl; - } - LOG(INFO) << "Profiler Range data : "; - } - - auto results = nvperf::evalMetricValues( - chipName_, counterDataImage, metricNames_, verbose /*verbose*/); - - // profiler end-end duration - auto duration_ms = std::chrono::duration_cast( - profilerStopTs_ - profilerStartTs_); - - auto init_dur_ms = std::chrono::duration_cast( - profilerInitDoneTs_ - profilerStartTs_); - LOG(INFO) << "Total profiler time = " << duration_ms.count() << " ms"; - LOG(INFO) << "Total profiler init time = " << init_dur_ms.count() << " ms"; - - return results; -} - -std::unique_ptr CuptiRBProfilerSession::getProfilerTraceSpan() { - return std::make_unique( - timeSinceEpoch(profilerStartTs_), - timeSinceEpoch(profilerStopTs_), - "__cupti_profiler__" - ); -} - -void CuptiRBProfilerSession::saveCounterData( - const std::string& /*CounterDataFileName*/, - const std::string& /*CounterDataSBFileName*/) { - /* TBD write binary files for counter data and counter scratch buffer */ -} - -/// Setup counter data -bool CuptiRBProfilerSession::createCounterDataImage() { - CUpti_Profiler_CounterDataImageOptions counterDataImageOptions; - counterDataImageOptions.pCounterDataPrefix = counterDataImagePrefix.data(); - 
counterDataImageOptions.counterDataPrefixSize = counterDataImagePrefix.size(); - counterDataImageOptions.maxNumRanges = maxRanges_; - counterDataImageOptions.maxNumRangeTreeNodes = maxRanges_; - counterDataImageOptions.maxRangeNameLength = 64; - - // Calculate size of counter data image - CUpti_Profiler_CounterDataImage_CalculateSize_Params calculateSizeParams = { - CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE, nullptr}; - calculateSizeParams.pOptions = &counterDataImageOptions; - calculateSizeParams.sizeofCounterDataImageOptions = - CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; - - CUPTI_CALL( - cuptiProfilerCounterDataImageCalculateSize(&calculateSizeParams)); - counterDataImage.resize(calculateSizeParams.counterDataImageSize); - - // Initialize counter data image - CUpti_Profiler_CounterDataImage_Initialize_Params initializeParams = { - CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE, nullptr}; - initializeParams.sizeofCounterDataImageOptions = - CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; - initializeParams.pOptions = &counterDataImageOptions; - initializeParams.counterDataImageSize = - calculateSizeParams.counterDataImageSize; - initializeParams.pCounterDataImage = counterDataImage.data(); - CUPTI_CALL(cuptiProfilerCounterDataImageInitialize(&initializeParams)); - - // Calculate counter Scratch Buffer size - CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params - scratchBufferSizeParams = { - CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE, nullptr}; - - scratchBufferSizeParams.counterDataImageSize = - calculateSizeParams.counterDataImageSize; - scratchBufferSizeParams.pCounterDataImage = - initializeParams.pCounterDataImage; - CUPTI_CALL(cuptiProfilerCounterDataImageCalculateScratchBufferSize( - &scratchBufferSizeParams)); - - counterDataScratchBuffer.resize( - scratchBufferSizeParams.counterDataScratchBufferSize); - - // Initialize scratch buffer - 
CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params - initScratchBufferParams = { - CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE, nullptr}; - - initScratchBufferParams.counterDataImageSize = - calculateSizeParams.counterDataImageSize; - - initScratchBufferParams.pCounterDataImage = - initializeParams.pCounterDataImage; - initScratchBufferParams.counterDataScratchBufferSize = - scratchBufferSizeParams.counterDataScratchBufferSize; - initScratchBufferParams.pCounterDataScratchBuffer = - counterDataScratchBuffer.data(); - - CUPTI_CALL(cuptiProfilerCounterDataImageInitializeScratchBuffer( - &initScratchBufferParams)); - - return true; -} - -#elif defined(HAS_CUPTI) - -// Create empty stubs for the API when CUPTI is not present. -CuptiRBProfilerSession::CuptiRBProfilerSession( - const std::vector& metricNames, - int deviceId, - int maxRanges, - int numNestingLevels, - CUcontext cuContext) - : metricNames_(metricNames), - deviceId_(deviceId), - maxRanges_(maxRanges), - numNestingLevels_(numNestingLevels), - cuContext_(cuContext) {} -void CuptiRBProfilerSession::stop() {} -void CuptiRBProfilerSession::enable() {} -void CuptiRBProfilerSession::disable() {} -void CuptiRBProfilerSession::beginPass() {} -bool CuptiRBProfilerSession::endPass() { return true; } -void CuptiRBProfilerSession::flushCounterData() {} -void CuptiRBProfilerSession::pushRange(const std::string& /*rangeName*/) {} -void CuptiRBProfilerSession::popRange() {} -void CuptiRBProfilerSession::asyncStartAndEnable( - CUpti_ProfilerRange /*profilerRange*/, - CUpti_ProfilerReplayMode /*profilerReplayMode*/) {} -void CuptiRBProfilerSession::asyncDisableAndStop() {} -CuptiProfilerResult CuptiRBProfilerSession::evaluateMetrics(bool verbose) { - static CuptiProfilerResult res; - return res; -}; -void CuptiRBProfilerSession::saveCounterData( - const std::string& /*CounterDataFileName*/, - const std::string& /*CounterDataSBFileName*/) {} -void CuptiRBProfilerSession::initCupti() 
{} -void CuptiRBProfilerSession::deInitCupti() {} -void CuptiRBProfilerSession::staticInit() {} -bool CuptiRBProfilerSession::createCounterDataImage() { return true; } -void CuptiRBProfilerSession::startInternal( - CUpti_ProfilerRange /*profilerRange*/, - CUpti_ProfilerReplayMode /*profilerReplayMode*/) {} -std::vector& CuptiRBProfilerSession::counterAvailabilityImage() { - static std::vector _vec; - return _vec; -} -#endif // HAS_CUPTI_RANGE_PROFILER - -namespace testing { - -void trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid) { -#if HAS_CUPTI_RANGE_PROFILER - __trackCudaCtx(ctx, device_id, cbid); -#endif // HAS_CUPTI_RANGE_PROFILER -} - -void trackCudaKernelLaunch(CUcontext ctx, const char* kernelName) { -#if HAS_CUPTI_RANGE_PROFILER - __trackCudaKernelLaunch(ctx, kernelName); -#endif // HAS_CUPTI_RANGE_PROFILER -} - -} // namespace testing -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.h b/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.h deleted file mode 100644 index 98a0b3ea5f4..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerApi.h +++ /dev/null @@ -1,220 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#ifdef HAS_CUPTI -#include -#include -// Using CUDA 11 and above due to usage of API: cuptiProfilerGetCounterAvailability. 
-#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000 && CUDART_VERSION < 11040 && CUDA_VERSION >= 11000 -#define HAS_CUPTI_RANGE_PROFILER 1 -#endif // CUDART_VERSION > 10.00 and < 11.04 && CUDA_VERSION >= 11.00 -#endif // HAS_CUPTI - -#if HAS_CUPTI_RANGE_PROFILER -#include -#include -#include -#else -using CUpti_ProfilerRange = enum -{ - CUPTI_AutoRange, - CUPTI_UserRange, -}; - -using CUpti_ProfilerReplayMode = enum -{ - CUPTI_KernelReplay, - CUPTI_UserReplay, -}; -#endif // HAS_CUPTI_RANGE_PROFILER - -#include -#include -#include -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "TraceSpan.h" -#include "CuptiCallbackApi.h" -#include "CuptiNvPerfMetric.h" - -/* Cupti Range based profiler session - * See : https://docs.nvidia.com/cupti/Cupti/r_main.html#r_profiler - */ - -namespace KINETO_NAMESPACE { - -class CuptiRBProfilerSession { - public: - // Initialize and configure CUPTI Profiler counters. - // - Metric names must be provided as string vector. - // - Supported values by CUPTI can be found at - - // https://docs.nvidia.com/cupti/Cupti/r_main.html#r_host_metrics_api - explicit CuptiRBProfilerSession( - const std::vector& metricNames, - int deviceId, - int maxRanges, - int numNestingLevels = 1, - CUcontext cuContext = 0); - - virtual ~CuptiRBProfilerSession() = default; - - // Start profiling session - // This function has to be called from the CPU thread running - // the CUDA context. 
If this is not the case asyncStartAndEnable() - // can be used - void start( - CUpti_ProfilerRange profilerRange = CUPTI_AutoRange, - CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_KernelReplay) { - startInternal(profilerRange, profilerReplayMode); - } - - // Stop profiling session - virtual void stop(); - - virtual void enable(); - virtual void disable(); - - // Profiler passes - // GPU hardware has limited performance monitoring resources - // the CUPTI profiler may need to run multiple passes to collect - // data for a given range - // If we use kernel replay model the kernels are automatically replayed - // else, you can use the beginPass() and endPass() functions below - // for user to manage the replays - - // starts a profiler pass with given kernels in between - virtual void beginPass(); - - // end a profiler pass with given kernels in between - // returns true if no more passes are required - virtual bool endPass(); - - // flushes the counter data - required if you use user replay - virtual void flushCounterData(); - - // Each pass can contain multiple of ranges - // metrics configured in a pass are collected per each range-stack. 
- virtual void pushRange(const std::string& rangeName); - virtual void popRange(); - - // utilities for common operations - void startAndEnable(); - void disableAndStop(); - - // Async APIs : these will can be called from another thread - // outside the CUDA context being profiled - void asyncStartAndEnable( - CUpti_ProfilerRange profilerRange = CUPTI_AutoRange, - CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_KernelReplay); - void asyncDisableAndStop(); - - void printMetrics() { - evaluateMetrics(true); - } - - std::unique_ptr getProfilerTraceSpan(); - - virtual CuptiProfilerResult evaluateMetrics(bool verbose = false); - - void saveCounterData( - const std::string& CounterDataFileName, - const std::string& CounterDataSBFileName); - - // This is not thread safe so please only call after - // profiling has stopped - const std::vector& getKernelNames() const { - return kernelNames_; - } - - int deviceId() const { - return deviceId_; - } - - bool profilingActive() const { - return profilingActive_; - } - - static std::set getActiveDevices(); - - static void initCupti(); - - static void deInitCupti(); - - static void staticInit(); - - static void setCounterAvailabilityImage(std::vector img) { - counterAvailabilityImage() = img; - } - protected: - CuptiRBProfilerSession(int deviceId, CUcontext ctx); - - virtual void startInternal( - CUpti_ProfilerRange profilerRange, - CUpti_ProfilerReplayMode profilerReplayMode); - - CUpti_ProfilerRange curRange_ = CUPTI_AutoRange; - CUpti_ProfilerReplayMode curReplay_ = CUPTI_KernelReplay; - - private: - - bool createCounterDataImage(); - - - // log kernel name that used with callbacks - void logKernelName(const char* kernel) { - std::lock_guard lg(kernelNamesMutex_); - kernelNames_.emplace_back(kernel); - } - - std::vector metricNames_; - std::string chipName_; - - uint32_t deviceId_ = 0; - int maxRanges_; - int numNestingLevels_; - CUcontext cuContext_; - - - // data buffers for configuration and counter data collection - 
std::vector counterDataImagePrefix; - std::vector configImage; - std::vector counterDataImage; - std::vector counterDataScratchBuffer; - - std::chrono::time_point profilerStartTs_; - std::chrono::time_point - profilerInitDoneTs_; - std::chrono::time_point profilerStopTs_; - - std::mutex kernelNamesMutex_; - // raw kernel names (not demangled) - std::vector kernelNames_; - - uint32_t numCallbacks_ = 0; - - static std::vector& counterAvailabilityImage(); - -#if HAS_CUPTI_RANGE_PROFILER - CUpti_Profiler_BeginPass_Params beginPassParams_; - CUpti_Profiler_EndPass_Params endPassParams_; -#endif - - bool initSuccess_ = false; - bool profilingActive_ = false; - - friend void __trackCudaKernelLaunch(CUcontext ctx, const char* kernelName); -}; - -// called directly only in unit tests -namespace testing { - -void trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid); -void trackCudaKernelLaunch(CUcontext ctx, const char* kernelName); - -} // namespace testing - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.cpp b/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.cpp deleted file mode 100644 index 04b1ad0cb3f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include - -#include -#include - -#include -#include - -using namespace std::chrono; - -namespace KINETO_NAMESPACE { - -// number of ranges affect the size of counter data binary used by -// the CUPTI Profiler. 
these defaults can be tuned -constexpr int KMaxAutoRanges = 1500; // supports 1500 kernels -constexpr int KMaxUserRanges = 10; // enable upto 10 sub regions marked by user - -constexpr char kCuptiProfilerMetricsKey[] = "CUPTI_PROFILER_METRICS"; -constexpr char kCuptiProfilerPerKernelKey[] = "CUPTI_PROFILER_ENABLE_PER_KERNEL"; -constexpr char kCuptiProfilerMaxRangesKey[] = "CUPTI_PROFILER_MAX_RANGES"; - -CuptiRangeProfilerConfig::CuptiRangeProfilerConfig(Config& cfg) - : parent_(&cfg), - cuptiProfilerPerKernel_(false), - cuptiProfilerMaxRanges_(0) {} - -bool CuptiRangeProfilerConfig::handleOption(const std::string& name, std::string& val) { - VLOG(0) << " handling : " << name << " = " << val; - // Cupti Range based Profiler configuration - if (!name.compare(kCuptiProfilerMetricsKey)) { - activitiesCuptiMetrics_ = splitAndTrim(val, ','); - } else if (!name.compare(kCuptiProfilerPerKernelKey)) { - cuptiProfilerPerKernel_ = toBool(val); - } else if (!name.compare(kCuptiProfilerMaxRangesKey)) { - cuptiProfilerMaxRanges_ = toInt64(val); - } else { - return false; - } - return true; -} - -void CuptiRangeProfilerConfig::setDefaults() { - if (activitiesCuptiMetrics_.size() > 0 && cuptiProfilerMaxRanges_ == 0) { - cuptiProfilerMaxRanges_ = - cuptiProfilerPerKernel_ ? 
KMaxAutoRanges : KMaxUserRanges; - } -} - -void CuptiRangeProfilerConfig::printActivityProfilerConfig(std::ostream& s) const { - if (activitiesCuptiMetrics_.size() > 0) { - s << "Cupti Profiler metrics : " - << fmt::format("{}", fmt::join(activitiesCuptiMetrics_, ", ")) << std::endl; - s << "Cupti Profiler measure per kernel : " - << cuptiProfilerPerKernel_ << std::endl; - s << "Cupti Profiler max ranges : " << cuptiProfilerMaxRanges_ << std::endl; - } -} - -void CuptiRangeProfilerConfig::registerFactory() { - Config::addConfigFactory( - kCuptiProfilerConfigName, - [](Config& cfg) { return new CuptiRangeProfilerConfig(cfg); }); -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.h b/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.h deleted file mode 100644 index 549b8a4e8b4..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/CuptiRangeProfilerConfig.h +++ /dev/null @@ -1,86 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include "Config.h" - -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -constexpr char kCuptiProfilerConfigName[] = "cupti_rb_profiler"; - -class CuptiRangeProfilerConfig : public AbstractConfig { - public: - bool handleOption(const std::string& name, std::string& val) override; - - void validate( - const std::chrono::time_point& - fallbackProfileStartTime) override {} - - static CuptiRangeProfilerConfig& get(const Config& cfg) { - return dynamic_cast(cfg.feature( - kCuptiProfilerConfigName)); - } - - Config& parent() const { - return *parent_; - } - - std::vector activitiesCuptiMetrics() const { - return activitiesCuptiMetrics_; - } - - bool cuptiProfilerPerKernel() const { - return cuptiProfilerPerKernel_; - } - - int64_t cuptiProfilerMaxRanges() const { - return cuptiProfilerMaxRanges_; - } - - void setSignalDefaults() override { - setDefaults(); - } - - void setClientDefaults() override { - setDefaults(); - } - - void printActivityProfilerConfig(std::ostream& s) const override; - - static void registerFactory(); - protected: - AbstractConfig* cloneDerived(AbstractConfig& parent) const override { - CuptiRangeProfilerConfig* clone = new CuptiRangeProfilerConfig(*this); - clone->parent_ = dynamic_cast(&parent); - return clone; - } - - private: - CuptiRangeProfilerConfig() = delete; - explicit CuptiRangeProfilerConfig(Config& parent); - explicit CuptiRangeProfilerConfig( - const CuptiRangeProfilerConfig& other) = default; - - // some defaults will depend on other configuration - void setDefaults(); - - // Associated Config object - Config* parent_; - - // Counter metrics exposed via CUPTI Profiler API - std::vector activitiesCuptiMetrics_; - - // Collect profiler metrics per kernel - autorange made - bool cuptiProfilerPerKernel_{false}; - - // max number of ranges to configure the profiler for. 
- // this has to be set before hand to reserve space for the output - int64_t cuptiProfilerMaxRanges_ = 0; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/DaemonConfigLoader.h b/plugins/tensorboard-plugins/libkineto/src/DaemonConfigLoader.h deleted file mode 100644 index 9b0ed928636..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/DaemonConfigLoader.h +++ /dev/null @@ -1,27 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include - -namespace KINETO_NAMESPACE { - -class DaemonConfigLoader { - public: - virtual ~DaemonConfigLoader() {} - - // Return the base config from the daemon - virtual std::string readBaseConfig() = 0; - - // Return a configuration string from the daemon, if one has been posted. - virtual std::string readOnDemandConfig(bool events, bool activities) = 0; - - // Returns the number of tracked contexts for this device. The daemon has a - // global view. If an unexpedted error occurs, return -1. - virtual int gpuContextCount(uint32_t device) = 0; - - virtual void setCommunicationFabric(bool enabled) = 0; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/Demangle.cpp b/plugins/tensorboard-plugins/libkineto/src/Demangle.cpp deleted file mode 100644 index f84f0b8ec36..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/Demangle.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "Demangle.h" - -#ifndef _MSC_VER -#include -#endif -#include -#include - -namespace KINETO_NAMESPACE { - -static constexpr int kMaxSymbolSize = 1024; - -std::string demangle(const char* name) { -#ifndef _MSC_VER - if (!name) { - return ""; - } - - if (strlen(name) > kMaxSymbolSize) { - return name; - } - - int status; - size_t len = 0; - char* demangled = abi::__cxa_demangle(name, nullptr, &len, &status); - if (status != 0) { - return name; - } - std::string res(demangled); - // The returned buffer must be freed! - free(demangled); - return res; -#else - // TODO: demangling on Windows - if (!name) { - return ""; - } else { - return name; - } -#endif -} - -std::string demangle(const std::string& name) { - return demangle(name.c_str()); -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/Demangle.h b/plugins/tensorboard-plugins/libkineto/src/Demangle.h deleted file mode 100644 index 6dcf0776f1a..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/Demangle.h +++ /dev/null @@ -1,12 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -namespace KINETO_NAMESPACE { - -std::string demangle(const char* name); -std::string demangle(const std::string& name); - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/EventProfiler.cpp b/plugins/tensorboard-plugins/libkineto/src/EventProfiler.cpp deleted file mode 100644 index dbf27552389..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/EventProfiler.cpp +++ /dev/null @@ -1,635 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "EventProfiler.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "CuptiEventApi.h" -#include "Logger.h" - -using namespace std::chrono; -using std::accumulate; -using std::endl; -using std::map; -using std::ostream; -using std::string; -using std::unique_ptr; -using std::vector; - -namespace KINETO_NAMESPACE { - -static std::mutex& logMutex() { - static std::mutex instance; - return instance; -} - -// --------------------------------------------------------------------- -// class Event -// --------------------------------------------------------------------- - -// Compute domain instance percentiles -PercentileList& Event::percentiles( - PercentileList& pcs, - const SampleSlice& slice) const { - vector instance_values; - instance_values.reserve(instanceCount); - for (int i = 0; i < instanceCount; i++) { - instance_values.push_back(sumInstance(i, slice)); - } - return KINETO_NAMESPACE::percentiles(instance_values, pcs); -} - -// Add up all samples for a given domain instance -int64_t Event::sumInstance(int i, const SampleSlice& slice) const { - auto r = toIdxRange(slice); - auto start = samples_.cbegin(); - std::advance(start, r.first); - auto end = start; - std::advance(end, r.second); - return accumulate(start, end, 0ul, [i](int64_t a, const Sample& b) { - return a + b.second[i]; - }); -} - -// Add up all samples across all domain instances -int64_t Event::sumAll(const SampleSlice& slice) const { - int64_t res = 0; - for (int i = 0; i < instanceCount; i++) { - res += sumInstance(i, slice); - } - return res; -} - -// Print raw sample values for all domains -void Event::printSamples(ostream& s, CUdevice device) const { - // Don't mess up output with interleaved lines - // Probably OK to reuse logMutex() here since this is - // used for debugging, but need to keep an eye on it. 
- std::lock_guard lock(logMutex()); - s << "Device " << device << " " << name << ":" << endl; - for (const auto& sample : samples_) { - const auto& vals = sample.second; - for (int64_t val : vals) { - s << val << " "; - } - s << endl; - } -} - -// --------------------------------------------------------------------- -// class Metric -// --------------------------------------------------------------------- -Metric::Metric( - string name, - CUpti_MetricID id, - vector events, - CUpti_MetricEvaluationMode eval_mode, - CuptiMetricApi& cupti_metrics) - : name(std::move(name)), - id_(id), - events_(std::move(events)), - evalMode_(eval_mode), - cuptiMetrics_(cupti_metrics), - valueKind_(cuptiMetrics_.valueKind(id)) {} - -// Return per-SM vector as well as total -struct Metric::CalculatedValues Metric::calculate( - map& event_map, - nanoseconds sample_duration, - const SampleSlice& slice) { - vector metric_values; - vector ev_values; - ev_values.reserve(events_.size()); - if (evalMode_ & CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE) { - int instance_count = instanceCount(event_map); - metric_values.reserve(instance_count); - for (int i = 0; i < instance_count; i++) { - ev_values.clear(); - for (CUpti_EventID event_id : events_) { - ev_values.push_back(event_map[event_id].sumInstance(i, slice)); - } - metric_values.push_back(cuptiMetrics_.calculate( - id_, valueKind_, events_, ev_values, sample_duration.count())); - } - } - - // FIXME: Check assumption that all instances are profiled - ev_values.clear(); - for (CUpti_EventID event_id : events_) { - ev_values.push_back(event_map[event_id].sumAll(slice)); - } - SampleValue total = cuptiMetrics_.calculate( - id_, valueKind_, events_, ev_values, sample_duration.count()); - if (evalMode_ & CUPTI_METRIC_EVALUATION_MODE_AGGREGATE) { - metric_values.push_back(total); - } - return {metric_values, std::move(total)}; -} - -void Metric::printDescription(ostream& s) const { - s << fmt::format("{} ({})", name, fmt::join(events_, ",")) << 
endl; -} - -// --------------------------------------------------------------------- -// class EventGroupSet -// --------------------------------------------------------------------- - -// Each domain has a set of counters. -// Some counters in a domain can be collected simultaneously in a "group" -// Counters from different domains can also be collected at the same time -// Therefore we have a "set of groups", or group set, with counters that -// can all be collected at once. -EventGroupSet::EventGroupSet( - CUpti_EventGroupSet& set, - map& events, - CuptiEventApi& cupti) - : set_(set), events_(events), cuptiEvents_(cupti), enabled_(false) { - for (int g = 0; g < set.numEventGroups; g++) { - CUpti_EventGroup grp = set.eventGroups[g]; - // Profile all domain instances - cuptiEvents_.enablePerInstance(grp); - uint32_t instance_count = cuptiEvents_.instanceCount(grp); - for (const auto& id : cuptiEvents_.eventsInGroup(grp)) { - VLOG(0) << "Instance count for " << id << ":" << instance_count; - events_[id].instanceCount = instance_count; - } - } -} - -EventGroupSet::~EventGroupSet() { - // Disable EventGroupSet in Cupti. 
- if (enabled_) { - setEnabled(false); - } -} - -// Enable or disable this group set -void EventGroupSet::setEnabled(bool enabled) { - if (enabled && !enabled_) { - cuptiEvents_.enableGroupSet(set_); - } else if (!enabled && enabled_) { - cuptiEvents_.disableGroupSet(set_); - } - enabled_ = enabled; -} - -// Collect counter values for each counter in group set -void EventGroupSet::collectSample() { - auto timestamp = system_clock::now(); - for (int g = 0; g < set_.numEventGroups; g++) { - CUpti_EventGroup grp = set_.eventGroups[g]; - for (const auto& id : cuptiEvents_.eventsInGroup(grp)) { - Event& ev = events_[id]; - vector vals(ev.instanceCount); - // FIXME: Use cuptiEventGroupReadAllEvents - cuptiEvents_.readEvent(grp, id, vals); - - if (VLOG_IS_ON(0)) { - for (int64_t v : vals) { - if (v == CUPTI_EVENT_OVERFLOW) { - LOG(WARNING) << "Counter overflow detected " - << "- decrease sample period!" << endl; - } - } - } - - ev.addSample(timestamp, vals); - } - } - - if (VLOG_IS_ON(1)) { - auto t2 = system_clock::now(); - VLOG(1) << "Device " << cuptiEvents_.device() << " Sample (us): " - << duration_cast(t2 - timestamp).count(); - } -} - -// Print names of events in this group set, ordered by group -void EventGroupSet::printDescription(ostream& s) const { - for (int g = 0; g < set_.numEventGroups; g++) { - s << " Events in group " << g << ": "; - for (const auto& id : cuptiEvents_.eventsInGroup(set_.eventGroups[g])) { - s << id << " (" << events_[id].name << ") "; - } - s << endl; - } -} - -// --------------------------------------------------------------------- -// class EventProfiler -// --------------------------------------------------------------------- - -// Find nearest factor of a number by linear search, -// starting at hi and lo - hi searches up and lo searches down -static int nearestFactor(int hi, int lo, int number) { - return number % hi == 0 - ? hi - : number % lo == 0 ? 
lo : nearestFactor(hi + 1, lo - 1, number); -} - -static int nearestFactor(int count, int max) { - return nearestFactor(count, count, max); -} - -void EventProfiler::initEvents(const std::set& eventNames) { - events_.clear(); - // Build event map - for (const auto& name : eventNames) { - events_.emplace(cuptiEvents_->eventId(name), name); - } -} - -void EventProfiler::initMetrics(const std::set& metricNames) { - metrics_.clear(); - // Add events from metrics - metrics_.reserve(metricNames.size()); - for (const auto& metric_name : metricNames) { - CUpti_MetricID metric_id = cuptiMetrics_->idFromName(metric_name); - if (metric_id == ~0) { - continue; - } - - const auto& events = cuptiMetrics_->events(metric_id); - vector event_ids; - event_ids.reserve(events.size()); - for (const auto& pair : events) { - CUpti_EventID id = pair.first; - const string& event_name = pair.second; - if (event_name.empty()) { - // For unnamed events, use metric name and event id - // FIXME: For subsequent metrics using the same event, - // this will be confusing - events_.emplace(id, metric_name + "_" + event_name); - } else { - events_.emplace(id, event_name); - } - event_ids.push_back(id); - } - metrics_.emplace_back( - metric_name, - metric_id, - event_ids, - cuptiMetrics_->evaluationMode(metric_id), - *cuptiMetrics_); - } -} - -bool EventProfiler::initEventGroups() { - sets_.clear(); - if (eventGroupSets_) { - cuptiEvents_->destroyGroupSets(eventGroupSets_); - eventGroupSets_ = nullptr; - } - if (events_.empty()) { - return true; - } - - // Determine sets of groups to be collected - vector ids; - ids.reserve(events_.size()); - for (const auto& ev : events_) { - ids.push_back(ev.first); - } - eventGroupSets_ = cuptiEvents_->createGroupSets(ids); - VLOG(0) << "Number of group sets: " << eventGroupSets_->numSets; - for (int i = 0; i < eventGroupSets_->numSets; i++) { - sets_.push_back( - EventGroupSet(eventGroupSets_->sets[i], events_, *cuptiEvents_)); - } - return !sets_.empty(); -} - 
-static unique_ptr alignAndValidateConfigs( - Config& base, - Config* onDemand) { - auto now = system_clock::now(); - if (!onDemand || - now > - (onDemand->eventProfilerOnDemandStartTime() + - onDemand->eventProfilerOnDemandDuration())) { - base.validate(now); - return base.clone(); - } - - auto res = base.clone(); - res->addEvents(onDemand->eventNames()); - res->addMetrics(onDemand->metricNames()); - - int sample_period = - std::min(base.samplePeriod().count(), onDemand->samplePeriod().count()); - if (sample_period < base.samplePeriod().count() && - (base.samplePeriod().count() % sample_period) != 0) { - sample_period = nearestFactor(sample_period, base.samplePeriod().count()); - LOG(WARNING) - << "On-demand sample period must be a factor of base sample period. " - << "Adjusting from " << onDemand->samplePeriod().count() << "ms to " - << sample_period << "ms."; - } - base.setSamplePeriod(milliseconds(sample_period)); - base.validate(now); - res->setSamplePeriod(base.samplePeriod()); - res->setMultiplexPeriod(base.multiplexPeriod()); - res->validate(now); - onDemand->setSamplePeriod(base.samplePeriod()); - onDemand->setMultiplexPeriod(base.multiplexPeriod()); - onDemand->validate(now); - - return res; -} - -static milliseconds minReportPeriod(const Config& config, int num_sets) { - return config.multiplexPeriod() * num_sets; -} - -static bool canSupportReportPeriod(const Config& config, int num_sets) { - // Can we get through the groups an even number per report period? - milliseconds min_report_period = minReportPeriod(config, num_sets); - return (config.reportPeriod().count() % min_report_period.count()) == 0; -} - -static int completeSamplesPerReport(const Config& config, int num_sets) { - if (num_sets <= 1) { - return config.reportPeriod() / config.samplePeriod(); - } - // Numnber of complete sample collections in the report period - // E.g. 
if report period is 10000ms, sample period 500ms, - // multiplex period 2000ms and num_sets is 5 then # of complete samples is - // (2000ms / 500ms) * (10000ms / 2000ms / 5) = 4 * 1 = 4 - int samples_per_multiplex_period = - config.multiplexPeriod() / config.samplePeriod(); - int multiplex_periods_per_report = - config.reportPeriod() / config.multiplexPeriod(); - return (multiplex_periods_per_report / num_sets) * - samples_per_multiplex_period; -} - -static bool canSupportSamplesPerReport(const Config& config, int num_sets) { - // Can samples per report can be honored with an exact *full* set of samples? - // We don't support partial samples at this point. - int full_samples_per_report = completeSamplesPerReport(config, num_sets); - return (full_samples_per_report % config.samplesPerReport()) == 0; -} - -static void adjustConfig(Config& config, int num_sets) { - // Don't change sample period and multiplex period here, since that can - // cause overflows and perf degradation. Report period and samples per - // report is OK to change (with warning). 
- if (!canSupportReportPeriod(config, num_sets)) { - milliseconds min_report_period = minReportPeriod(config, num_sets); - LOG(WARNING) << "Report period must be a multiple of " - << min_report_period.count() << "ms (" << num_sets - << " event sets * " << config.multiplexPeriod().count() - << "ms multiplex period), in order to get complete samples."; - auto new_report_period = - Config::alignUp(config.reportPeriod(), min_report_period); - double sf = - ((double)new_report_period.count()) / config.reportPeriod().count(); - int new_samples_per_report = std::round(config.samplesPerReport() * sf); - LOG(WARNING) << "Adjusting report period from " - << config.reportPeriod().count() << "ms to " - << new_report_period.count() << "ms"; - if (new_samples_per_report != config.samplesPerReport()) { - LOG(WARNING) << "Adjusting samples per report from " - << config.samplesPerReport() << " to " - << new_samples_per_report; - } - config.setReportPeriod(new_report_period); - config.setSamplesPerReport(new_samples_per_report); - } - // Ensure that samples per report can be honored with - // an exact *full* set of samples. Don't support partial - // samples at this point. - if (!canSupportSamplesPerReport(config, num_sets)) { - int full_samples_per_report = completeSamplesPerReport(config, num_sets); - int adjusted_count = - nearestFactor(config.samplesPerReport(), full_samples_per_report); - LOG(WARNING) - << "Samples per report must be such that an even number of " - << "complete samples can be aggregated in each report period. 
Adjusting" - << " from " << config.samplesPerReport() << " to " << adjusted_count - << " (complete sample count is " << full_samples_per_report << ")"; - config.setSamplesPerReport(adjusted_count); - } -} - -// Prepare profiler -EventProfiler::EventProfiler( - std::unique_ptr cupti_events, - std::unique_ptr cupti_metrics, - vector>& loggers, - vector>& onDemandLoggers) - : cuptiEvents_(std::move(cupti_events)), - cuptiMetrics_(std::move(cupti_metrics)), - loggers_(loggers), - onDemandLoggers_(onDemandLoggers) {} - -void EventProfiler::reportSamples() { - dispatchSamples(*config_, loggers_, baseSamples_); - baseSamples_ += completeSamplesPerReport(*config_, sets_.size()); -} - -void EventProfiler::reportOnDemandSamples() { - dispatchSamples(*onDemandConfig_, onDemandLoggers_, onDemandSamples_); - onDemandSamples_ += completeSamplesPerReport(*onDemandConfig_, sets_.size()); -} - -EventProfiler::~EventProfiler() { - if (eventGroupSets_) { - for (auto& set : sets_) { - set.setEnabled(false); - } - cuptiEvents_->destroyGroupSets(eventGroupSets_); - } - VLOG(0) << "Stopped event profiler for device " << device(); -} - -void EventProfiler::updateLoggers(Config& config, Config* on_demand_config) { - // Update loggers. - for (auto& logger : loggers_) { - std::lock_guard lock(logMutex()); - logger->update(config); - } - - if (on_demand_config) { - // Update onDemand loggers. - for (auto& logger : onDemandLoggers_) { - std::lock_guard lock(logMutex()); - logger->update(*on_demand_config); - } - } -} - -bool EventProfiler::applyConfig(const Config& config) { - // Initialize events, metrics, and event group sets. 
- // TODO: Send warnings / errors back to dyno for onDemand config - try { - if (!initEventsAndMetrics(config)) { - return false; - } - } catch (const std::exception& ex) { - LOG(WARNING) << "Failed to apply config (" << ex.what() << ")"; - return false; - } - - return true; -} - -bool EventProfiler::initEventsAndMetrics(const Config& config) { - initEvents(config.eventNames()); - initMetrics(config.metricNames()); - // We now have the total list of events to collect - // They need to be organized into groups for multiplexing - if (!initEventGroups()) { - LOG(WARNING) << "No events/metrics initialized successfully"; - return false; - } - - if (VLOG_IS_ON(1)) { - printMetrics(LIBKINETO_DBG_STREAM); - printSets(LIBKINETO_DBG_STREAM); - } - return true; -} - -void EventProfiler::printSets(ostream& s) const { - for (int i = 0; i < sets_.size(); i++) { - s << "Set " << i << endl; - sets_[i].printDescription(s); - } -} - -void EventProfiler::printMetrics(ostream& s) const { - s << "Metrics:" << endl; - for (const Metric& m : metrics_) { - m.printDescription(s); - } -} - -void EventProfiler::printAllSamples(ostream& s, CUdevice device) const { - for (const auto& pair : events_) { - const Event& ev = pair.second; - ev.printSamples(s, device); - } -} - -void EventProfiler::enableNextCounterSet() { - if (sets_.size() > 1) { - auto t1 = system_clock::now(); - - VLOG(1) << "Disabling set " << curEnabledSet_; - sets_[curEnabledSet_].setEnabled(false); - curEnabledSet_ = (curEnabledSet_ + 1) % sets_.size(); - VLOG(1) << "Enabling set " << curEnabledSet_; - sets_[curEnabledSet_].setEnabled(true); - - if (VLOG_IS_ON(1)) { - auto t2 = system_clock::now(); - VLOG(1) << "Switch (us): " - << duration_cast(t2 - t1).count(); - } - } -} - -// Notify listeners of collected samples -void EventProfiler::dispatchSamples( - const Config& config, - const vector>& loggers, - int sample_offset) { - Sample sample(events_.size() + metrics_.size()); - // Normalize values to per second - auto delta 
= config.reportPeriod() / config.samplesPerReport(); - double sf = 1000.0 * sets_.size() / delta.count(); - for (int i = 0; i < config.samplesPerReport(); i++) { - sample.stats.clear(); - sample.deltaMsec = (delta * i).count(); - SampleSlice slice = {sample_offset, i, config.samplesPerReport()}; - VLOG(1) << "Slice: " << sample_offset << ", " << i << ", " - << config.samplesPerReport(); - for (const auto& pair : events_) { - const Event& ev = pair.second; - int64_t total = std::round(sf * ev.sumAll(slice)); - PercentileList pcs = initPercentiles(config.percentiles()); - normalize(ev.percentiles(pcs, slice), sf); - sample.stats.push_back({ev.name, std::move(pcs), SampleValue(total)}); - } - - for (auto& m : metrics_) { - // calculate returns a pair of per-SM vector and a total - auto vals = m.calculate(events_, delta, slice); - PercentileList pcs = initPercentiles(config.percentiles()); - sample.stats.push_back( - {m.name, std::move(percentiles(vals.perInstance, pcs)), vals.total}); - } - - for (auto& logger : loggers) { - std::lock_guard lock(logMutex()); - logger->handleSample(device(), sample, config.ipcFabricEnabled()); - } - } - - if (VLOG_IS_ON(2)) { - printAllSamples(LIBKINETO_DBG_STREAM, device()); - } -} - -void EventProfiler::configure(Config& config, Config* onDemandConfig) { - if (!sets_.empty()) { - sets_[curEnabledSet_].setEnabled(false); - clearSamples(); - } - - config_ = config.clone(); - onDemandConfig_ = onDemandConfig ? onDemandConfig->clone() : nullptr; - mergedConfig_ = alignAndValidateConfigs(*config_, onDemandConfig_.get()); - if (!applyConfig(*mergedConfig_)) { - LOG(WARNING) << "Failed to apply config!"; - mergedConfig_ = config_->clone(); - applyConfig(*config_); - } - if (!sets_.empty()) { - // Make timing adjustments based on multiplexing requirements. 
- adjustConfig(*config_, sets_.size()); - if (onDemandConfig_) { - int duration = onDemandConfig_->eventProfilerOnDemandDuration().count(); - LOG(INFO) << "On demand profiler activated for " << duration << " secs"; - adjustConfig(*onDemandConfig_, sets_.size()); - } - // If events or metrics were added or removed, need to tell loggers - updateLoggers(*config_, onDemandConfig_.get()); - } - - curEnabledSet_ = 0; - if (!sets_.empty()) { - sets_[0].setEnabled(true); - } else { - VLOG(0) << "No counters profiled!"; - } - - baseSamples_ = 0; - onDemandSamples_ = 0; -} - -void EventProfiler::collectSample() { - if (sets_.empty()) { - return; - } - sets_[curEnabledSet_].collectSample(); - if (VLOG_IS_ON(1)) { - printAllSamples(LIBKINETO_DBG_STREAM, device()); - } -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/EventProfiler.h b/plugins/tensorboard-plugins/libkineto/src/EventProfiler.h deleted file mode 100644 index fafd5b9bb83..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/EventProfiler.h +++ /dev/null @@ -1,341 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "Config.h" -#include "CuptiEventApi.h" -#include "CuptiMetricApi.h" -#include "SampleListener.h" - -namespace KINETO_NAMESPACE { - -// Helper function for computing percentiles (nearest-rank). -// Modifies the input. 
-template -inline PercentileList& percentiles(std::vector values, PercentileList& pcs) { - auto size = values.size(); - for (auto& x : pcs) { - int idx = std::min(size - 1, (x.first * size) / 100); - std::nth_element(values.begin(), values.begin() + idx, values.end()); - x.second = SampleValue(values[idx]); - } - return pcs; -} - -// Helper function for normalizing a percentile list -// Modifies the input -inline PercentileList& normalize(PercentileList& pcs, double sf) { - for (auto& pc : pcs) { - pc.second *= sf; - } - return pcs; -} - -// A slice of the sample buffer -struct SampleSlice { - // Start offset (samples) - int offset; - // Slice number - int index; - // Out of this many - int count; -}; - -// A sampled event -class Event { - public: - /* implicit */ Event(std::string name) : name(std::move(name)) {} - /* implicit */ Event(const char* name) : name(name) {} - Event() : name("INVALID") {} - - Event(const Event&) = delete; - Event& operator=(const Event&) = delete; - Event(Event&&) = default; - Event& operator=(Event&&) = default; - - void addSample( - std::chrono::time_point timestamp, - const std::vector& values) { - assert(values.size() == instanceCount); - samples_.emplace_back(timestamp, values); - } - - // Sum samples for a single domain instance - int64_t sumInstance(int i, const SampleSlice& slice) const; - - // Sum all samples across all domain instances - int64_t sumAll(const SampleSlice& slice) const; - - // Create list of percentiles - PercentileList& percentiles(PercentileList& pcs, const SampleSlice& slice) - const; - - void eraseSamples(int count) { - auto end = samples_.begin(); - std::advance(end, count); - samples_.erase(samples_.begin(), end); - } - - void clearSamples() { - samples_.clear(); - } - - int sampleCount() { - return samples_.size(); - } - - void printSamples(std::ostream& s, CUdevice device) const; - - // Event name (see nvprof --query-events) - std::string name; - - // Number of domain instances for this event, e.g. 
number of SMs - int instanceCount = 0; - - private: - std::pair toIdxRange(const SampleSlice& slice) const { - int size = (samples_.size() - slice.offset) / slice.count; - return std::make_pair(slice.offset + (slice.index * size), size); - } - - // List of collected samples, where each sample has values for - // one or more domain instances - using Sample = std::pair< - std::chrono::time_point, - std::vector>; - std::list samples_; -}; - -class Metric { - public: - Metric( - std::string name, - CUpti_MetricID id, - std::vector events, - CUpti_MetricEvaluationMode eval_mode, - CuptiMetricApi& cupti_metrics); - - struct CalculatedValues { - std::vector perInstance; - SampleValue total; - }; - - struct CalculatedValues calculate( - std::map& events, - std::chrono::nanoseconds sample_duration, - const SampleSlice& slice); - - int instanceCount(std::map& events) { - return events[events_[0]].instanceCount; - } - - void printDescription(std::ostream& s) const; - - std::string name; - - private: - CUpti_MetricID id_; - std::vector events_; - CUpti_MetricEvaluationMode evalMode_; - // Calls to CUPTI is encapsulated behind this interface - CuptiMetricApi& cuptiMetrics_; - CUpti_MetricValueKind valueKind_; -}; - -/** - * A set of event groups. - * Holds all the events that may be collected in a single pass. - * A group contains one or more counters for a single domain. - * A group set contains zero or one groups per domain. 
- */ -class EventGroupSet { - public: - EventGroupSet( - CUpti_EventGroupSet& set, - std::map& events, - CuptiEventApi& cupti); - ~EventGroupSet(); - - EventGroupSet(const EventGroupSet&) = delete; - EventGroupSet& operator=(const EventGroupSet&) = delete; - EventGroupSet(EventGroupSet&&) = default; - EventGroupSet& operator=(EventGroupSet&&) = delete; - - // Number of groups = number of domains profiled - int groupCount() const { - return set_.numEventGroups; - } - - void setEnabled(bool enabled); - // Take a sample of counters in this group set - void collectSample(); - void printDescription(std::ostream& s) const; - - private: - CUpti_EventGroupSet& set_; - std::map& events_; - // Calls to CUPTI is encapsulated behind this interface - CuptiEventApi& cuptiEvents_; - bool enabled_; -}; - -// The sampler -class EventProfiler { - public: - explicit EventProfiler( - std::unique_ptr cupti_events, - std::unique_ptr cupti_metrics, - std::vector>& loggers, - std::vector>& onDemandLoggers); - EventProfiler(const EventProfiler&) = delete; - EventProfiler& operator=(const EventProfiler&) = delete; - ~EventProfiler(); - - void configure(Config& config, Config* onDemandConfig); - - bool isOnDemandActive() { - return !!onDemandConfig_; - } - - // Print the counter sets. Multiple sets will be multiplexed. 
- void printSets(std::ostream& s) const; - - // Print metrics descriptions - void printMetrics(std::ostream& s) const; - - bool enableForDevice(Config& cfg); - - CUdevice device() { - return cuptiEvents_->device(); - } - - bool setContinuousMode() { - return cuptiEvents_->setContinuousMode(); - } - - std::chrono::milliseconds samplePeriod() { - return mergedConfig_->samplePeriod(); - } - - std::chrono::milliseconds multiplexPeriod() { - return mergedConfig_->multiplexPeriod(); - } - - std::chrono::milliseconds reportPeriod() { - return config_->reportPeriod(); - } - - std::chrono::milliseconds onDemandReportPeriod() { - return onDemandConfig_->reportPeriod(); - } - - // Read values of currently running counters. - void collectSample(); - - void reportSamples(); - void reportOnDemandSamples(); - - bool enabled() { - return sets_.size() > 0; - } - - bool multiplexEnabled() { - return sets_.size() > 1; - } - - // Multiplex counters. - void enableNextCounterSet(); - - void eraseReportedSamples() { - int erase_count = baseSamples_; - if (onDemandConfig_ && - onDemandConfig_->eventProfilerOnDemandDuration().count() > 0) { - erase_count = std::min(baseSamples_, onDemandSamples_); - } - eraseSamples(erase_count); - baseSamples_ -= erase_count; - onDemandSamples_ -= erase_count; - } - - void clearSamples() { - for (auto& pair : events_) { - pair.second.clearSamples(); - } - baseSamples_ = 0; - onDemandSamples_ = 0; - } - - private: - // Functions to initialize profiler based on Config settings. 
- bool applyConfig(const Config& config); - bool initEventsAndMetrics(const Config& config); - void initEvents(const std::set& eventNames); - void initMetrics(const std::set& metricNames); - bool initEventGroups(); - - PercentileList initPercentiles(const std::vector& percentiles) { - PercentileList res; - res.reserve(percentiles.size()); - for (int p : percentiles) { - res.emplace_back(p, SampleValue(0)); - } - return res; - } - - // Notify listeners of collected samples - void dispatchSamples( - const Config& config, - const std::vector>& loggers, - int report_nr); - - void eraseSamples(int count) { - for (auto& pair : events_) { - pair.second.eraseSamples(count); - } - } - - void updateLoggers(Config& config, Config* on_demand_config); - - // Print all collected samples since last clear. - void printAllSamples(std::ostream& s, CUdevice device) const; - - // Calls to CUPTI is encapsulated behind these interfaces - std::unique_ptr cuptiEvents_; - std::unique_ptr cuptiMetrics_; - // The CUpti API reports event IDs, we must map them to our event objects - std::map events_; - // List of metrics - std::vector metrics_; - // The countert sets needed to collect all counters - std::vector sets_; - // The event group set object returned by Cupti. - // Saved s.t. we can call cuptiEventGroupSetsDestroy to free memory when - // the object is no longer needed. 
- CUpti_EventGroupSets* eventGroupSets_ = nullptr; - // Current multiplexed counter set - int curEnabledSet_{0}; - - std::unique_ptr config_; - std::unique_ptr onDemandConfig_; - std::unique_ptr mergedConfig_; - int baseSamples_{0}; - int onDemandSamples_{0}; - - // Shared between profiler threads - // Vectors are read-only but calling loggers require lock - const std::vector>& loggers_; - const std::vector>& onDemandLoggers_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/EventProfilerController.cpp b/plugins/tensorboard-plugins/libkineto/src/EventProfilerController.cpp deleted file mode 100644 index 0427cc7a90c..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/EventProfilerController.cpp +++ /dev/null @@ -1,423 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "EventProfilerController.h" - -#include -#include -#include - -#include "ConfigLoader.h" -#include "CuptiEventApi.h" -#include "CuptiMetricApi.h" -#include "EventProfiler.h" -#include "output_csv.h" - -#include "Logger.h" -#include "ThreadUtil.h" - -using namespace std::chrono; -using std::unique_ptr; -using std::vector; - -namespace KINETO_NAMESPACE { - -namespace { - -vector(const Config&)>>& -loggerFactories() { - static vector(const Config&)>> - factories; - return factories; -} - -vector(const Config&)>>& -onDemandLoggerFactories() { - static vector(const Config&)>> - factories; - return factories; -} - -vector> makeLoggers(const Config& config) { - vector> loggers; - for (const auto& factory : loggerFactories()) { - loggers.push_back(factory(config)); - } - loggers.push_back(std::make_unique()); - loggers.push_back(std::make_unique()); - return loggers; -} - -vector> makeOnDemandLoggers( - const Config& config) { - vector> loggers; - for (const auto& factory : onDemandLoggerFactories()) { - loggers.push_back(factory(config)); - } - loggers.push_back(std::make_unique()); - return loggers; -} - 
-vector>& loggers(const Config& config) { - static auto res = makeLoggers(config); - return res; -} - -vector>& onDemandLoggers( - const Config& config) { - static auto res = makeOnDemandLoggers(config); - return res; -} - -} // anon namespace - -// Keep an eye on profiling threads. -// We've observed deadlocks in Cuda11 in libcuda / libcupti.. -namespace detail { - -class HeartbeatMonitor { - - public: - ~HeartbeatMonitor() { - stopMonitoring(); - } - - static HeartbeatMonitor& instance() { - static HeartbeatMonitor monitor; - return monitor; - } - - void profilerHeartbeat() { - int32_t tid = systemThreadId(); - std::lock_guard lock(mutex_); - profilerAliveMap_[tid]++; - } - - void setPeriod(seconds period) { - { - std::lock_guard lock(mutex_); - if (period_ == period) { - return; - } - period_ = period; - } - if (period == seconds(0)) { - stopMonitoring(); - } else { - startMonitoring(); - } - } - - private: - HeartbeatMonitor() = default; - - void monitorLoop() { - std::unique_lock lock(mutex_); - while(!stopMonitor_) { - auto cv_status = condVar_.wait_for(lock, seconds(period_)); - // Don't perform check on spurious wakeup or on notify - if (cv_status == std::cv_status::timeout) { - for (auto& pair : profilerAliveMap_) { - int32_t tid = pair.first; - int& i = pair.second; - if (i == 0) { - LOG(ERROR) << "Thread " << tid << " appears stuck!"; - } - i = 0; - } - } - } - } - - void startMonitoring() { - if (!monitorThread_) { - VLOG(0) << "Starting monitoring thread"; - stopMonitor_ = false; - monitorThread_ = std::make_unique( - &HeartbeatMonitor::monitorLoop, this); - } - } - - void stopMonitoring() { - if (monitorThread_) { - VLOG(0) << "Stopping monitoring thread"; - stopMonitor_ = true; - condVar_.notify_one(); - monitorThread_->join(); - monitorThread_ = nullptr; - VLOG(0) << "Monitoring thread terminated"; - } - } - - std::map profilerAliveMap_; - std::unique_ptr monitorThread_; - std::mutex mutex_; - std::condition_variable condVar_; - std::atomic_bool 
stopMonitor_{false}; - seconds period_{0}; -}; - -} // namespace detail - -namespace { -// Profiler map singleton -std::map>& profilerMap() { - static std::map> instance; - return instance; -} - -void reportLateSample( - int sleepMs, - int sampleMs, - int reportMs, - int reprogramMs) { - LOG_EVERY_N(WARNING, 10) << "Lost sample due to delays (ms): " << sleepMs - << ", " << sampleMs << ", " << reportMs << ", " - << reprogramMs; -} - -void configureHeartbeatMonitor( - detail::HeartbeatMonitor& monitor, const Config& base, const Config* onDemand) { - seconds base_period = - base.eventProfilerHeartbeatMonitorPeriod(); - seconds on_demand_period = !onDemand ? seconds(0) : - onDemand->eventProfilerHeartbeatMonitorPeriod(); - monitor.setPeriod( - on_demand_period > seconds(0) ? on_demand_period : base_period); -} - -} // anon namespace - -void EventProfilerController::addLoggerFactory( - std::function(const Config&)> factory) { - loggerFactories().push_back(factory); -} - -void EventProfilerController::addOnDemandLoggerFactory( - std::function(const Config&)> factory) { - onDemandLoggerFactories().push_back(factory); -} - -EventProfilerController::EventProfilerController( - CUcontext context, - ConfigLoader& configLoader, - detail::HeartbeatMonitor& heartbeatMonitor) - : configLoader_(configLoader), heartbeatMonitor_(heartbeatMonitor) { - auto cupti_events = std::make_unique(context); - auto cupti_metrics = - std::make_unique(cupti_events->device()); - configLoader_.addHandler( - ConfigLoader::ConfigKind::EventProfiler, this); - auto config = configLoader.getConfigCopy(); - profiler_ = std::make_unique( - std::move(cupti_events), - std::move(cupti_metrics), - loggers(*config), - onDemandLoggers(*config)); - profilerThread_ = std::make_unique( - &EventProfilerController::profilerLoop, this); -} - -EventProfilerController::~EventProfilerController() { - if (profilerThread_) { - // signaling termination of the profiler loop - stopRunloop_ = true; - profilerThread_->join(); - 
} - configLoader_.removeHandler( - ConfigLoader::ConfigKind::EventProfiler, this); - VLOG(0) << "Stopped event profiler"; -} - -// Must be called under lock -void EventProfilerController::start(CUcontext ctx, ConfigLoader& configLoader) { - profilerMap()[ctx] = unique_ptr( - new EventProfilerController( - ctx, configLoader, detail::HeartbeatMonitor::instance())); -} - -// Must be called under lock -void EventProfilerController::stop(CUcontext ctx) { - profilerMap()[ctx] = nullptr; -} - -bool EventProfilerController::canAcceptConfig() { - std::lock_guard guard(mutex_); - return !newOnDemandConfig_; -} - -void EventProfilerController::acceptConfig(const Config& config) { - if (config.eventProfilerOnDemandDuration().count() == 0) { - // Ignore - not for this profiler - return; - } - std::lock_guard guard(mutex_); - if (newOnDemandConfig_) { - LOG(ERROR) << "On demand request already queued - ignoring new request"; - return; - } - newOnDemandConfig_ = config.clone(); - LOG(INFO) << "Received new on-demand config"; -} - -bool EventProfilerController::enableForDevice(Config& cfg) { - // FIXME: Use device unique id! - if (!cfg.eventProfilerEnabledForDevice(profiler_->device())) { - return false; - } - // context count includes the new context - int instances = configLoader_.contextCountForGpu(profiler_->device()); - VLOG(0) << "Device context count: " << instances; - return instances >= 0 && instances <= cfg.maxEventProfilersPerGpu(); -} - -void EventProfilerController::profilerLoop() { - // We limit the number of profilers that can exist per GPU - auto config = configLoader_.getConfigCopy(); - if (!enableForDevice(*config)) { - VLOG(0) << "Not starting EventProfiler - profilers for GPU " - << profiler_->device() << " exceeds profilers per GPU limit (" - << config->maxEventProfilersPerGpu() << ")"; - return; - } - - if (!profiler_->setContinuousMode()) { - VLOG(0) << "Continuous mode not supported for GPU " - << profiler_->device() << ". 
Not starting Event Profiler."; - return; - } - - VLOG(0) << "Starting Event Profiler for GPU " << profiler_->device(); - setThreadName("CUPTI Event Profiler"); - - time_point next_sample_time; - time_point next_report_time; - time_point next_on_demand_report_time; - time_point next_multiplex_time; - std::unique_ptr on_demand_config = nullptr; - bool reconfigure = true; - bool restart = true; - int report_count = 0; - int on_demand_report_count = 0; - while (!stopRunloop_) { - heartbeatMonitor_.profilerHeartbeat(); - if (configLoader_.hasNewConfig(*config)) { - config = configLoader_.getConfigCopy(); - VLOG(0) << "Base config changed"; - report_count = 0; - reconfigure = true; - } - - auto now = system_clock::now(); - if (on_demand_config && - now > (on_demand_config->eventProfilerOnDemandStartTime() + - on_demand_config->eventProfilerOnDemandDuration())) { - on_demand_config = nullptr; - LOG(INFO) << "On-demand profiling complete"; - reconfigure = true; - } - - if (!profiler_->isOnDemandActive()) { - std::lock_guard lock(mutex_); - if (newOnDemandConfig_) { - VLOG(0) << "Received on-demand config, reconfiguring"; - on_demand_config = std::move(newOnDemandConfig_); - reconfigure = true; - on_demand_report_count = 0; - } - } - - if (reconfigure) { - try { - profiler_->configure(*config, on_demand_config.get()); - } catch (const std::exception& ex) { - LOG(ERROR) << "Encountered error while configuring event profiler: " - << ex.what(); - // Exit profiling entirely when encountering an error here - // as it indicates a serious problem or bug. 
- break; - } - configureHeartbeatMonitor( - heartbeatMonitor_, *config, on_demand_config.get()); - reconfigure = false; - restart = true; - } - - if (restart) { - now = system_clock::now(); - next_sample_time = now + profiler_->samplePeriod(); - next_report_time = now + profiler_->reportPeriod(); - if (profiler_->isOnDemandActive()) { - next_on_demand_report_time = now + profiler_->onDemandReportPeriod(); - } - next_multiplex_time = now + profiler_->multiplexPeriod(); - // Collect an initial sample and throw it away - // The next sample is the first valid one - profiler_->collectSample(); - profiler_->clearSamples(); - restart = false; - } - - auto start_sleep = now; - while (now < next_sample_time) { - /* sleep override */ - std::this_thread::sleep_for(next_sample_time - now); - now = system_clock::now(); - } - int sleep_time = duration_cast(now - start_sleep).count(); - - auto start_sample = now; - profiler_->collectSample(); - now = system_clock::now(); - int sample_time = duration_cast(now - start_sample).count(); - - next_sample_time += profiler_->samplePeriod(); - if (now > next_sample_time) { - reportLateSample(sleep_time, sample_time, 0, 0); - restart = true; - continue; - } - - auto start_report = now; - if (now > next_report_time) { - VLOG(1) << "Report #" << report_count++; - profiler_->reportSamples(); - next_report_time += profiler_->reportPeriod(); - } - if (profiler_->isOnDemandActive() && now > next_on_demand_report_time) { - VLOG(1) << "OnDemand Report #" << on_demand_report_count++; - profiler_->reportOnDemandSamples(); - next_on_demand_report_time += profiler_->onDemandReportPeriod(); - } - profiler_->eraseReportedSamples(); - now = system_clock::now(); - int report_time = duration_cast(now - start_report).count(); - - if (now > next_sample_time) { - reportLateSample(sleep_time, sample_time, report_time, 0); - restart = true; - continue; - } - - auto start_multiplex = now; - if (profiler_->multiplexEnabled() && now > next_multiplex_time) { - 
profiler_->enableNextCounterSet(); - next_multiplex_time += profiler_->multiplexPeriod(); - } - now = system_clock::now(); - int multiplex_time = - duration_cast(now - start_multiplex).count(); - - if (now > next_sample_time) { - reportLateSample(sleep_time, sample_time, report_time, multiplex_time); - restart = true; - } - - VLOG(0) << "Runloop execution time: " - << duration_cast(now - start_sample).count() << "ms"; - } - - VLOG(0) << "Device " << profiler_->device() - << ": Exited event profiling loop"; -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/EventProfilerController.h b/plugins/tensorboard-plugins/libkineto/src/EventProfilerController.h deleted file mode 100644 index 007a82faa92..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/EventProfilerController.h +++ /dev/null @@ -1,63 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include -#include - -#include - -#include "ConfigLoader.h" - -namespace KINETO_NAMESPACE { - -class Config; -class ConfigLoader; -class EventProfiler; -class SampleListener; - -namespace detail { -class HeartbeatMonitor; -} - -class EventProfilerController : public ConfigLoader::ConfigHandler { - public: - EventProfilerController(const EventProfilerController&) = delete; - EventProfilerController& operator=(const EventProfilerController&) = delete; - - ~EventProfilerController(); - - static void start(CUcontext ctx, ConfigLoader& configLoader); - static void stop(CUcontext ctx); - - static void addLoggerFactory( - std::function(const Config&)> factory); - - static void addOnDemandLoggerFactory( - std::function(const Config&)> factory); - - bool canAcceptConfig() override; - - void acceptConfig(const Config& config) override; - - private: - explicit EventProfilerController( - CUcontext context, - ConfigLoader& configLoader, - detail::HeartbeatMonitor& heartbeatMonitor); - bool 
enableForDevice(Config& cfg); - void profilerLoop(); - - ConfigLoader& configLoader_; - std::unique_ptr newOnDemandConfig_; - detail::HeartbeatMonitor& heartbeatMonitor_; - std::unique_ptr profiler_; - std::unique_ptr profilerThread_; - std::atomic_bool stopRunloop_{false}; - std::mutex mutex_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/GenericTraceActivity.cpp b/plugins/tensorboard-plugins/libkineto/src/GenericTraceActivity.cpp deleted file mode 100644 index 4e00b1256c4..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/GenericTraceActivity.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "GenericTraceActivity.h" -#include "output_base.h" - -namespace libkineto { - void GenericTraceActivity::log(ActivityLogger& logger) const { - logger.handleGenericActivity(*this); - } -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/ILoggerObserver.cpp b/plugins/tensorboard-plugins/libkineto/src/ILoggerObserver.cpp deleted file mode 100644 index f0106578811..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ILoggerObserver.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ILoggerObserver.h" - -#if !USE_GOOGLE_LOG - -#include -#include - -namespace libkineto { - -struct LoggerTypeName { - constexpr LoggerTypeName(const char* n, LoggerOutputType t) : name(n), type(t) {}; - const char* name; - LoggerOutputType type; -}; - -static constexpr std::array LoggerMap{{ - {"VERBOSE", LoggerOutputType::VERBOSE}, - {"INFO", LoggerOutputType::INFO}, - {"WARNING", LoggerOutputType::WARNING}, - {"ERROR", LoggerOutputType::ERROR}, - {"STAGE", LoggerOutputType::STAGE}, - {"???", LoggerOutputType::ENUM_COUNT} -}}; - -static constexpr bool matchingOrder(int idx = 0) { - return LoggerMap[idx].type == LoggerOutputType::ENUM_COUNT || - ((idx == (int) LoggerMap[idx].type) && matchingOrder(idx + 1)); -} -static_assert(matchingOrder(), "LoggerTypeName map is out of order"); - -const char* toString(LoggerOutputType t) { - if(t < VERBOSE || t >= ENUM_COUNT) { - return LoggerMap[ENUM_COUNT].name; - } - return LoggerMap[(int)t].name; -} - -LoggerOutputType toLoggerOutputType(const std::string& str) { - for (int i = 0; i < LoggerTypeCount; i++) { - if (str == LoggerMap[i].name) { - return LoggerMap[i].type; - } - } - throw std::invalid_argument(fmt::format("Invalid activity type: {}", str)); -} - -} // namespace libkineto - - -#endif // !USE_GOOGLE_LOG diff --git a/plugins/tensorboard-plugins/libkineto/src/Logger.cpp b/plugins/tensorboard-plugins/libkineto/src/Logger.cpp deleted file mode 100644 index dbde765f51f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/Logger.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "Logger.h" -#include "ILoggerObserver.h" - -#ifndef USE_GOOGLE_LOG - -#include -#include -#include -#include -#include - -#include -#include - -#include "ThreadUtil.h" - -namespace KINETO_NAMESPACE { - -std::atomic_int Logger::severityLevel_{VERBOSE}; -std::atomic_int Logger::verboseLogLevel_{-1}; -std::atomic Logger::verboseLogModules_{~0ull}; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wglobal-constructors" -std::mutex Logger::loggerObserversMutex_; -#pragma GCC diagnostic pop - - -Logger::Logger(int severity, int line, const char* filePath, int errnum) - : buf_(), out_(LIBKINETO_DBG_STREAM), errnum_(errnum), messageSeverity_(severity) { - buf_ << toString((LoggerOutputType) severity) << ":"; - - const auto tt = - std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); - const char* file = strrchr(filePath, '/'); - buf_ << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(tt)) << " " - << processId() << ":" << systemThreadId() << " " - << (file ? file + 1 : filePath) << ":" << line << "] "; -} - -Logger::~Logger() { -#ifdef __linux__ - if (errnum_ != 0) { - thread_local char buf[1024]; - buf_ << " : " << strerror_r(errnum_, buf, sizeof(buf)); - } -#endif - - { - std::lock_guard guard(loggerObserversMutex_); - for (auto* observer : loggerObservers()) { - // Output to observers. Current Severity helps keep track of which bucket the output goes. - if (observer) { - observer->write(buf_.str(), (LoggerOutputType) messageSeverity_); - } - } - } - - // Finally, print to terminal or console. 
- out_ << buf_.str() << std::endl; -} - -void Logger::setVerboseLogModules(const std::vector& modules) { - uint64_t mask = 0; - if (modules.empty()) { - mask = ~0ull; - } else { - for (const std::string& name : modules) { - mask |= hash(name.c_str()); - } - } - verboseLogModules_ = mask; -} - -void Logger::addLoggerObserver(ILoggerObserver* observer) { - if (observer == nullptr) { - return; - } - std::lock_guard guard(loggerObserversMutex_); - loggerObservers().insert(observer); -} - -void Logger::removeLoggerObserver(ILoggerObserver* observer) { - std::lock_guard guard(loggerObserversMutex_); - loggerObservers().erase(observer); -} - -void Logger::addLoggerObserverDevice(int64_t device) { - std::lock_guard guard(loggerObserversMutex_); - for (auto observer : loggerObservers()) { - observer->addDevice(device); - } -} - -void Logger::addLoggerObserverEventCount(int64_t count) { - std::lock_guard guard(loggerObserversMutex_); - for (auto observer : loggerObservers()) { - observer->addEventCount(count); - } -} - -void Logger::setLoggerObserverTraceDurationMS(int64_t duration) { - std::lock_guard guard(loggerObserversMutex_); - for (auto observer : loggerObservers()) { - observer->setTraceDurationMS(duration); - } -} - -void Logger::setLoggerObserverTraceID(const std::string& tid) { - std::lock_guard guard(loggerObserversMutex_); - for (auto observer : loggerObservers()) { - observer->setTraceID(tid); - } -} - -void Logger::setLoggerObserverGroupTraceID(const std::string& gtid) { - std::lock_guard guard(loggerObserversMutex_); - for (auto observer : loggerObservers()) { - observer->setGroupTraceID(gtid); - } -} - -void Logger::addLoggerObserverDestination(const std::string& dest) { - std::lock_guard guard(loggerObserversMutex_); - for (auto observer : loggerObservers()) { - observer->addDestination(dest); - } -} - -} // namespace KINETO_NAMESPACE - -#endif // USE_GOOGLE_LOG diff --git a/plugins/tensorboard-plugins/libkineto/src/Logger.h 
b/plugins/tensorboard-plugins/libkineto/src/Logger.h deleted file mode 100644 index 868fc84b9f4..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/Logger.h +++ /dev/null @@ -1,244 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -#define LIBKINETO_DBG_STREAM std::cerr - -#if USE_GOOGLE_LOG - -#include - -#define SET_LOG_SEVERITY_LEVEL(level) -#define SET_LOG_VERBOSITY_LEVEL(level, modules) -#define LOGGER_OBSERVER_ADD_DEVICE(device) -#define LOGGER_OBSERVER_ADD_EVENT_COUNT(count) -#define LOGGER_OBSERVER_SET_TRACE_DURATION_MS(duration) -#define LOGGER_OBSERVER_SET_TRACE_ID(tid) -#define LOGGER_OBSERVER_SET_GROUP_TRACE_ID(gtid) -#define LOGGER_OBSERVER_ADD_DESTINATION(dest) -#define UST_LOGGER_MARK_COMPLETED(stage) - -#else // !USE_GOOGLE_LOG -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ILoggerObserver.h" - -#ifdef _MSC_VER -// unset a predefined ERROR (windows) -#undef ERROR -#endif // _MSC_VER - -namespace KINETO_NAMESPACE { - -class Logger { - public: - Logger(int severity, int line, const char* filePath, int errnum = 0); - ~Logger(); - - inline std::ostream& stream() { - return buf_; - } - - static inline void setSeverityLevel(int level) { - severityLevel_ = level; - } - - static inline int severityLevel() { - return severityLevel_; - } - - static inline void setVerboseLogLevel(int level) { - verboseLogLevel_ = level; - } - - static inline int verboseLogLevel() { - return verboseLogLevel_; - } - - // This is constexpr so that the hash for a file name is computed at compile - // time when used in the VLOG macros. - // This way, there is no string comparison for matching VLOG modules, - // only a comparison of pre-computed hashes. - // No fancy hashing needed here. 
It's pretty inefficient (one character - // at a time) but the strings are not large and it's not in the critical path. - static constexpr uint64_t rol(uint64_t val, int amount) { - return val << amount | val >> (63 - amount); - } - static constexpr uint64_t hash(const char* s) { - uint64_t hash = hash_rec(s, 0); - return hash & rol(0x41a0240682483014ull, hash & 63); - } - static constexpr uint64_t hash_rec(const char* s, int off) { - // Random constants! - return (!s[off] ? 57ull : (hash_rec(s, off + 1) * 293) ^ s[off]); - } - static constexpr const char* basename(const char* s, int off = 0) { - return !s[off] - ? s - : s[off] == '/' ? basename(&s[off + 1]) : basename(s, off + 1); - } - - static void setVerboseLogModules(const std::vector& modules); - - static inline uint64_t verboseLogModules() { - return verboseLogModules_; - } - - static void clearLoggerObservers() { - std::lock_guard g(loggerObserversMutex_); - loggerObservers().clear(); - } - - static void addLoggerObserver(ILoggerObserver* observer); - - static void removeLoggerObserver(ILoggerObserver* observer); - - static void addLoggerObserverDevice(int64_t device); - - static void addLoggerObserverEventCount(int64_t count); - - static void setLoggerObserverTraceDurationMS(int64_t duration); - - static void setLoggerObserverTraceID(const std::string& tid); - - static void setLoggerObserverGroupTraceID(const std::string& gtid); - - static void addLoggerObserverDestination(const std::string& dest); - - private: - std::stringstream buf_; - std::ostream& out_; - int errnum_; - int messageSeverity_; - static std::atomic_int severityLevel_; - static std::atomic_int verboseLogLevel_; - static std::atomic verboseLogModules_; - static std::set& loggerObservers() { - static auto* inst = new std::set(); - return *inst; - } - static std::mutex loggerObserversMutex_; -}; - -class VoidLogger { - public: - VoidLogger() {} - void operator&(std::ostream&) {} -}; - -} // namespace KINETO_NAMESPACE - -#ifdef LOG // 
Undefine in case these are already defined (quite likely) -#undef LOG -#undef LOG_IS_ON -#undef LOG_IF -#undef LOG_EVERY_N -#undef LOG_IF_EVERY_N -#undef DLOG -#undef DLOG_IF -#undef VLOG -#undef VLOG_IF -#undef VLOG_EVERY_N -#undef VLOG_IS_ON -#undef DVLOG -#undef LOG_FIRST_N -#undef CHECK -#undef DCHECK -#undef DCHECK_EQ -#undef PLOG -#undef PCHECK -#undef LOG_OCCURRENCES -#endif - -#define LOG_IS_ON(severity) \ - (severity >= libkineto::Logger::severityLevel()) - -#define LOG_IF(severity, condition) \ - !(LOG_IS_ON(severity) && (condition)) ? (void)0 : libkineto::VoidLogger() & \ - libkineto::Logger(severity, __LINE__, __FILE__).stream() - -#define LOG(severity) LOG_IF(severity, true) - -#define LOCAL_VARNAME_CONCAT(name, suffix) _##name##suffix##_ - -#define LOCAL_VARNAME(name) LOCAL_VARNAME_CONCAT(name, __LINE__) - -#define LOG_OCCURRENCES LOCAL_VARNAME(log_count) - -#define LOG_EVERY_N(severity, rate) \ - static int LOG_OCCURRENCES = 0; \ - LOG_IF(severity, LOG_OCCURRENCES++ % rate == 0) \ - << "(x" << LOG_OCCURRENCES << ") " - -template -struct __to_constant__ { - static const uint64_t val = n; -}; -#define FILENAME_HASH \ - __to_constant__::val -#define VLOG_IS_ON(verbosity) \ - (libkineto::Logger::verboseLogLevel() >= verbosity && \ - (libkineto::Logger::verboseLogModules() & FILENAME_HASH) == FILENAME_HASH) - -#define VLOG_IF(verbosity, condition) \ - LOG_IF(VERBOSE, VLOG_IS_ON(verbosity) && (condition)) - -#define VLOG(verbosity) VLOG_IF(verbosity, true) - -#define VLOG_EVERY_N(verbosity, rate) \ - static int LOG_OCCURRENCES = 0; \ - VLOG_IF(verbosity, LOG_OCCURRENCES++ % rate == 0) \ - << "(x" << LOG_OCCURRENCES << ") " - -#define PLOG(severity) \ - libkineto::Logger(severity, __LINE__, __FILE__, errno).stream() - -#define SET_LOG_SEVERITY_LEVEL(level) \ - libkineto::Logger::setSeverityLevel(level) - -#define SET_LOG_VERBOSITY_LEVEL(level, modules) \ - libkineto::Logger::setVerboseLogLevel(level); \ - libkineto::Logger::setVerboseLogModules(modules) - 
-// Logging the set of devices the trace is collect on. -#define LOGGER_OBSERVER_ADD_DEVICE(device_count) \ - libkineto::Logger::addLoggerObserverDevice(device_count) - -// Incrementing the number of events collected by this trace. -#define LOGGER_OBSERVER_ADD_EVENT_COUNT(count) \ - libkineto::Logger::addLoggerObserverEventCount(count) - -// Record duration of trace in milliseconds. -#define LOGGER_OBSERVER_SET_TRACE_DURATION_MS(duration) \ - libkineto::Logger::setLoggerObserverTraceDurationMS(duration) - -// Record the trace id when given. -#define LOGGER_OBSERVER_SET_TRACE_ID(tid) \ - libkineto::Logger::setLoggerObserverTraceID(tid) - -// Record the group trace id when given. -#define LOGGER_OBSERVER_SET_GROUP_TRACE_ID(gtid) \ - libkineto::Logger::setLoggerObserverGroupTraceID(gtid) - -// Log the set of destinations the trace is sent to. -#define LOGGER_OBSERVER_ADD_DESTINATION(dest) \ - libkineto::Logger::addLoggerObserverDestination(dest) - -// UST Logger Semantics to describe when a stage is complete. -#define UST_LOGGER_MARK_COMPLETED(stage) \ - LOG(libkineto::LoggerOutputType::STAGE) << "Completed Stage: " << stage - -#endif // USE_GOOGLE_LOG diff --git a/plugins/tensorboard-plugins/libkineto/src/LoggerCollector.h b/plugins/tensorboard-plugins/libkineto/src/LoggerCollector.h deleted file mode 100644 index bb05aab218d..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/LoggerCollector.h +++ /dev/null @@ -1,70 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#if !USE_GOOGLE_LOG - -#include -#include -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ILoggerObserver.h" - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -class LoggerCollector : public ILoggerObserver { - public: - LoggerCollector() : buckets_() {} - - void write(const std::string& message, LoggerOutputType ot = ERROR) override { - // Skip STAGE output type which is only used by USTLoggerCollector. - if (ot != STAGE) { - buckets_[ot].push_back(message); - } - } - - const std::map> extractCollectorMetadata() override { - return buckets_; - } - - void reset() override { - trace_duration_ms = 0; - event_count = 0; - destinations.clear(); - } - - void addDevice(const int64_t device) override { - devices.insert(device); - } - - void setTraceDurationMS(const int64_t duration) override { - trace_duration_ms = duration; - } - - void addEventCount(const int64_t count) override { - event_count += count; - } - - void addDestination(const std::string& dest) override { - destinations.insert(dest); - } - - protected: - std::map> buckets_; - - // These are useful metadata to collect from CUPTIActivityProfiler for internal tracking. - std::set devices; - int64_t trace_duration_ms{0}; - std::atomic event_count{0}; - std::set destinations; - -}; - -} // namespace KINETO_NAMESPACE - -#endif // !USE_GOOGLE_LOG diff --git a/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.cpp b/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.cpp deleted file mode 100644 index 73eff13e2a0..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.cpp +++ /dev/null @@ -1,569 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "RoctracerActivityApi.h" - -#include -#include -#include - -#include "Demangle.h" -#include "output_base.h" -#include "ThreadUtil.h" - -typedef uint64_t timestamp_t; - -static timestamp_t timespec_to_ns(const timespec& time) { - return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; - } - -using namespace std::chrono; - -namespace KINETO_NAMESPACE { - -constexpr size_t kBufSize(2 * 1024 * 1024); - -RoctracerActivityApi& RoctracerActivityApi::singleton() { - static RoctracerActivityApi instance; - return instance; -} - -RoctracerActivityApi::RoctracerActivityApi() { - gpuTraceBuffers_ = std::make_unique>(); -} - -RoctracerActivityApi::~RoctracerActivityApi() { - disableActivities(std::set()); - endTracing(); -} - -void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { -#ifdef HAS_ROCTRACER - if (!singleton().externalCorrelationEnabled_) { - return; - } - // placeholder -#endif -} - -void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { -#ifdef HAS_ROCTRACER - if (!singleton().externalCorrelationEnabled_) { - return; - } - // placeholder -#endif -} - -void RoctracerActivityApi::setMaxBufferSize(int size) { - maxGpuBufferCount_ = 1 + size / kBufSize; -} - -int RoctracerActivityApi::processActivities( - ActivityLogger& logger) { - // Find offset to map from monotonic clock to system clock. - // This will break time-ordering of events but is status quo. 
- - timespec t0, t1, t00; - clock_gettime(CLOCK_REALTIME, &t0); - clock_gettime(CLOCK_MONOTONIC, &t1); - clock_gettime(CLOCK_REALTIME, &t00); - - const timestamp_t toffset = (timespec_to_ns(t0) >> 1) + (timespec_to_ns(t00) >> 1) - timespec_to_ns(t1); - - int count = 0; - - // Basic Api calls - - for (auto &item : rows_) { - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::CUDA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - logger.handleGenericActivity(a); - ++count; - } - - // Malloc/Free calls - for (auto &item : mallocRows_) { - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::CUDA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - a.addMetadata("ptr", item.ptr); - if (item.cid == HIP_API_ID_hipMalloc) { - a.addMetadata("size", item.size); - } - - logger.handleGenericActivity(a); - ++count; - } - - // HipMemcpy calls - for (auto &item : copyRows_) { - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::CUDA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - a.addMetadata("src", item.src); - a.addMetadata("dst", item.dst); - a.addMetadata("size", item.size); - a.addMetadata("kind", 
item.kind); - if ((item.cid == HIP_API_ID_hipMemcpyAsync) || (item.cid == HIP_API_ID_hipMemcpyWithStream)) { - a.addMetadata("stream", fmt::format("{}", reinterpret_cast(item.stream))); - } - - logger.handleGenericActivity(a); - ++count; - } - - // Kernel Launch Api calls - - for (auto &item : kernelRows_) { - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::CUDA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - if (item.functionAddr != nullptr) { - a.addMetadataQuoted( - "kernel", demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream))); - } - else if (item.function != nullptr) { - a.addMetadataQuoted( - "kernel", demangle(hipKernelNameRef(item.function))); - } - a.addMetadata("grid dim", fmt::format("[{}, {}, {}]", item.gridX, item.gridY, item.gridZ)); - a.addMetadata("block dim", fmt::format("[{}, {}, {}]", item.workgroupX, item.workgroupY, item.workgroupZ)); - a.addMetadata("shared size", item.groupSegmentSize); - a.addMetadata("stream", fmt::format("{}", reinterpret_cast(item.stream))); - - // Stash launches to tie to the async ops - kernelLaunches_[a.id] = a; - - // Stash kernel names to tie to the async ops - std::string name; - if (item.functionAddr != nullptr) { - name = demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream)); - } - else if (item.function != nullptr) { - name = demangle(hipKernelNameRef(item.function)); - } - if (!name.empty()) { - uint32_t string_id = reverseStrings_[name]; - if (string_id == 0) { - string_id = nextStringId_++; - reverseStrings_[name] = string_id; - strings_[string_id] = name; - } - kernelNames_[item.id] = string_id; - } - - logger.handleGenericActivity(a); - ++count; - } - - // Async Ops - - for (auto& buffer : 
*gpuTraceBuffers_) { - const roctracer_record_t* record = (const roctracer_record_t*)(buffer.data); - const roctracer_record_t* end_record = (const roctracer_record_t*)(buffer.data + buffer.validSize); - GenericTraceActivity a; - - while (record < end_record) { - if ((record->domain == ACTIVITY_DOMAIN_HIP_API) && (loggedIds_.contains(record->op))) { - const char *name = roctracer_op_string(record->domain, record->op, record->kind); - a.device = record->process_id; - a.resource = record->thread_id; - - a.startTime = (record->begin_ns + toffset) / 1000; - a.endTime = (record->end_ns + toffset) / 1000; - a.id = record->correlation_id; - - a.activityType = ActivityType::CUDA_RUNTIME; - a.activityName = std::string(name); - a.flow.id = record->correlation_id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - logger.handleGenericActivity(a); - ++count; - } - else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) { - // Overlay launch metadata for kernels - auto kit = kernelLaunches_.find(record->correlation_id); - if (kit != kernelLaunches_.end()) { - a = (*kit).second; - } - - const char *name = roctracer_op_string(record->domain, record->op, record->kind); - a.device = record->device_id; - a.resource = record->queue_id; - - a.startTime = (record->begin_ns + toffset) / 1000; - a.endTime = (record->end_ns + toffset) / 1000; - a.id = record->correlation_id; - - a.activityType = ActivityType::CONCURRENT_KERNEL; - a.activityName = std::string(name); - a.flow.id = record->correlation_id; - a.flow.type = kLinkAsyncCpuGpu; - - auto it = kernelNames_.find(record->correlation_id); - if (it != kernelNames_.end()) { - a.activityName = strings_[it->second]; - } - - logger.handleGenericActivity(a); - ++count; - } - - roctracer_next_record(record, &record); - } - } - return count; -} - -void RoctracerActivityApi::clearActivities() { - gpuTraceBuffers_->clear(); - rows_.clear(); - kernelRows_.clear(); - copyRows_.clear(); - mallocRows_.clear(); - kernelLaunches_.clear(); -} - 
-void RoctracerActivityApi::api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) -{ - RoctracerActivityApi *dis = &singleton(); - - if (domain == ACTIVITY_DOMAIN_HIP_API && dis->loggedIds_.contains(cid)) { - const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); - - // Pack callbacks into row structures - - static timespec timestamp; // FIXME verify thread safety - - if (data->phase == ACTIVITY_API_PHASE_ENTER) { - clock_gettime(CLOCK_MONOTONIC, ×tamp); // record proper clock - } - else { // (data->phase == ACTIVITY_API_PHASE_EXIT) - timespec endTime; - timespec startTime { timestamp }; - clock_gettime(CLOCK_MONOTONIC, &endTime); // record proper clock - - switch (cid) { - case HIP_API_ID_hipLaunchKernel: - case HIP_API_ID_hipExtLaunchKernel: - case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here - { - auto &args = data->args.hipLaunchKernel; - dis->kernelRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.function_address, - nullptr, - args.numBlocks.x, - args.numBlocks.y, - args.numBlocks.z, - args.dimBlocks.x, - args.dimBlocks.y, - args.dimBlocks.z, - args.sharedMemBytes, - args.stream - ); - } - break; - case HIP_API_ID_hipHccModuleLaunchKernel: - case HIP_API_ID_hipModuleLaunchKernel: - case HIP_API_ID_hipExtModuleLaunchKernel: - { - auto &args = data->args.hipModuleLaunchKernel; - dis->kernelRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - nullptr, - args.f, - args.gridDimX, - args.gridDimY, - args.gridDimZ, - args.blockDimX, - args.blockDimY, - args.blockDimZ, - args.sharedMemBytes, - args.stream - ); - } - break; - case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: - case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: -#if 0 - { - auto &args = 
data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val; - dis->kernelRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.function_address, - nullptr, - args.numBlocks.x, - args.numBlocks.y, - args.numBlocks.z, - args.dimBlocks.x, - args.dimBlocks.y, - args.dimBlocks.z, - args.sharedMemBytes, - args.stream - ); - } -#endif - break; - case HIP_API_ID_hipMalloc: - dis->mallocRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - data->args.hipMalloc.ptr__val, - data->args.hipMalloc.size - ); - break; - case HIP_API_ID_hipFree: - dis->mallocRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - data->args.hipFree.ptr, - 0 - ); - break; - case HIP_API_ID_hipMemcpy: - { - auto &args = data->args.hipMemcpy; - dis->copyRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.src, - args.dst, - args.sizeBytes, - args.kind, - static_cast(0) // use placeholder? 
- ); - } - break; - case HIP_API_ID_hipMemcpyAsync: - case HIP_API_ID_hipMemcpyWithStream: - { - auto &args = data->args.hipMemcpyAsync; - dis->copyRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.src, - args.dst, - args.sizeBytes, - args.kind, - args.stream - ); - } - break; - default: - dis->rows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime) - ); - break; - } - } - } -} - -void RoctracerActivityApi::activity_callback(const char* begin, const char* end, void* arg) -{ - size_t size = end - begin; - uint8_t *buffer = (uint8_t*) malloc(size); - auto &gpuTraceBuffers = singleton().gpuTraceBuffers_; - memcpy(buffer, begin, size); - gpuTraceBuffers->emplace_back(buffer, size); -} - -void RoctracerActivityApi::enableActivities( - const std::set& selected_activities) { -#ifdef HAS_ROCTRACER - if (!registered_) { - roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr); // Magic encantation - - // Set some api calls to ignore - loggedIds_.setInvertMode(true); // Omit the specified api - loggedIds_.add("hipGetDevice"); - loggedIds_.add("hipSetDevice"); - loggedIds_.add("hipGetLastError"); - loggedIds_.add("__hipPushCallConfiguration"); - loggedIds_.add("__hipPopCallConfiguration"); - loggedIds_.add("hipCtxSetCurrent"); - loggedIds_.add("hipEventRecord"); - loggedIds_.add("hipEventQuery"); - loggedIds_.add("hipGetDeviceProperties"); - loggedIds_.add("hipPeekAtLastError"); - loggedIds_.add("hipModuleGetFunction"); - loggedIds_.add("hipEventCreateWithFlags"); - - // Enable API callbacks - if (loggedIds_.invertMode() == true) { - // exclusion list - enable entire domain and turn off things in list - roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, api_callback, nullptr); - const std::unordered_map &filter = loggedIds_.filterList(); - for (auto it = filter.begin(); 
it != filter.end(); ++it) { - roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first); - } - } - else { - // inclusion list - only enable things in the list - const std::unordered_map &filter = loggedIds_.filterList(); - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - for (auto it = filter.begin(); it != filter.end(); ++it) { - roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first, api_callback, nullptr); - } - } - //roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, api_callback, nullptr); - - // Allocate default tracing pool - roctracer_properties_t properties; - memset(&properties, 0, sizeof(roctracer_properties_t)); - properties.buffer_size = 0x1000; - roctracer_open_pool(&properties); - - // Enable async op collection - roctracer_properties_t hcc_cb_properties; - memset(&hcc_cb_properties, 0, sizeof(roctracer_properties_t)); - hcc_cb_properties.buffer_size = 0x4000; - hcc_cb_properties.buffer_callback_fun = activity_callback; - roctracer_open_pool_expl(&hcc_cb_properties, &hccPool_); - roctracer_enable_domain_activity_expl(ACTIVITY_DOMAIN_HCC_OPS, hccPool_); - - registered_ = true; - } - - for (const auto& activity : selected_activities) { - if (activity == ActivityType::EXTERNAL_CORRELATION) { - externalCorrelationEnabled_ = true; - } - } - - roctracer_start(); -#endif -} - -void RoctracerActivityApi::disableActivities( - const std::set& selected_activities) { -#ifdef HAS_ROCTRACER - roctracer_stop(); - roctracer_flush_activity_expl(hccPool_); - - for (const auto& activity : selected_activities) { - if (activity == ActivityType::EXTERNAL_CORRELATION) { - externalCorrelationEnabled_ = false; - } - } -#endif -} - -void RoctracerActivityApi::endTracing() { - if (registered_ == true) { - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - //roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX); - - roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS); - roctracer_close_pool_expl(hccPool_); - } -} - - 
-ApiIdList::ApiIdList() -: invert_(true) -{ -} - -void ApiIdList::add(std::string apiName) -{ - uint32_t cid = 0; - if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { - filter_[cid] = 1; - } -} -void ApiIdList::remove(std::string apiName) -{ - uint32_t cid = 0; - if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { - filter_.erase(cid); - } -} - -bool ApiIdList::loadUserPrefs() -{ - // placeholder - return false; -} -bool ApiIdList::contains(uint32_t apiId) -{ - return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.h b/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.h deleted file mode 100644 index 28280253e7c..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityApi.h +++ /dev/null @@ -1,171 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef HAS_ROCTRACER -#include -#include -#include -#include -#include -#endif - -#include "ActivityType.h" -#include "GenericTraceActivity.h" -#include "RoctracerActivityBuffer.h" - - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -class ApiIdList -{ -public: - ApiIdList(); - bool invertMode() { return invert_; } - void setInvertMode(bool invert) { invert_ = invert; } - void add(std::string apiName); - void remove(std::string apiName); - bool loadUserPrefs(); - bool contains(uint32_t apiId); - const std::unordered_map &filterList() { return filter_; } - -private: - std::unordered_map filter_; - bool invert_; -}; - -struct roctracerRow { - roctracerRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end) - : id(id), domain(domain), cid(cid), pid(pid), tid(tid), begin(begin), end(end) {} - uint64_t id; // correlation_id - uint32_t domain; - uint32_t cid; - uint32_t pid; - uint32_t tid; - uint64_t begin; - uint64_t end; -}; - -struct kernelRow : public roctracerRow { - kernelRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end - , const void *faddr, hipFunction_t function - , unsigned int gx, unsigned int gy, unsigned int gz - , unsigned int wx, unsigned int wy, unsigned int wz - , size_t gss, hipStream_t stream) - : roctracerRow(id, domain, cid, pid, tid, begin, end), functionAddr(faddr) - , function(function), gridX(gx), gridY(gy), gridZ(gz) - , workgroupX(wx), workgroupY(wy), workgroupZ(wz), groupSegmentSize(gss) - , stream(stream) {} - const void* functionAddr; - hipFunction_t function; - unsigned int gridX; - unsigned int gridY; - unsigned int gridZ; - unsigned int workgroupX; - unsigned int workgroupY; - unsigned int workgroupZ; - size_t groupSegmentSize; - hipStream_t stream; -}; - -struct copyRow : public roctracerRow { - 
copyRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end - , const void* src, const void *dst, size_t size, hipMemcpyKind kind - , hipStream_t stream) - : roctracerRow(id, domain, cid, pid, tid, begin, end) - , src(src), dst(dst), size(size), kind(kind), stream(stream) {} - const void *src; - const void *dst; - size_t size; - hipMemcpyKind kind; - hipStream_t stream; -}; - -struct mallocRow : public roctracerRow { - mallocRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end - , const void* ptr, size_t size) - : roctracerRow(id, domain, cid, pid, tid, begin, end) - , ptr(ptr), size(size) {} - const void *ptr; - size_t size; -}; - - -class RoctracerActivityApi { - public: - enum CorrelationFlowType { - Default, - User - }; - - RoctracerActivityApi(); - RoctracerActivityApi(const RoctracerActivityApi&) = delete; - RoctracerActivityApi& operator=(const RoctracerActivityApi&) = delete; - - virtual ~RoctracerActivityApi(); - - static RoctracerActivityApi& singleton(); - - static void pushCorrelationID(int id, CorrelationFlowType type); - static void popCorrelationID(CorrelationFlowType type); - - void enableActivities( - const std::set& selected_activities); - void disableActivities( - const std::set& selected_activities); - void clearActivities(); - - int processActivities(ActivityLogger& logger); - - void setMaxBufferSize(int size); - - std::atomic_bool stopCollection{false}; - - private: - bool registered_{false}; - void endTracing(); - -#ifdef HAS_ROCTRACER - roctracer_pool_t *hccPool_{NULL}; - static void api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg); - static void activity_callback(const char* begin, const char* end, void* arg); - - //Name cache - uint32_t nextStringId_{2}; - std::map strings_; - std::map reverseStrings_; - std::map kernelNames_; - - ApiIdList loggedIds_; - - // Api callback data - std::deque rows_; - 
std::deque kernelRows_; - std::deque copyRows_; - std::deque mallocRows_; - std::map kernelLaunches_; -#endif - - int maxGpuBufferCount_{0}; - std::unique_ptr> gpuTraceBuffers_; - bool externalCorrelationEnabled_{true}; -}; - -} // namespace KINETO_NAMESPACE - diff --git a/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityBuffer.h b/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityBuffer.h deleted file mode 100644 index cd8a5709a84..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/RoctracerActivityBuffer.h +++ /dev/null @@ -1,30 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -class RoctracerActivityBuffer { - public: - // data must be allocated using malloc. - // Ownership is transferred to this object. - RoctracerActivityBuffer(uint8_t* data, size_t validSize) - : data(data), validSize(validSize) {} - - ~RoctracerActivityBuffer() { - free(data); - } - - // Allocated by malloc - uint8_t* data{nullptr}; - - // Number of bytes used - size_t validSize; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/SampleListener.h b/plugins/tensorboard-plugins/libkineto/src/SampleListener.h deleted file mode 100644 index bff86ad122a..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/SampleListener.h +++ /dev/null @@ -1,146 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -class Config; - -class SampleValue { - public: - template - explicit SampleValue(T v) { - init(v); - } - - SampleValue(const SampleValue&) = default; - SampleValue& operator=(const SampleValue&) = delete; - SampleValue(SampleValue&&) = default; - SampleValue& operator=(SampleValue&&) = default; - - bool isInt() const { - return type_ == INT64; - } - - int64_t getInt() const { - assert(isInt()); - return int_; - } - - bool isDouble() const { - return type_ == DOUBLE; - } - - double getDouble() const { - assert(isDouble()); - return dbl_; - } - - inline void operator*=(double x) { - assert(isDouble() || isInt()); - if (isDouble()) { - dbl_ *= x; - } else { - int_ = std::round(int_ * x); - } - } - - inline bool operator<(const SampleValue& o) const { - if (type_ != o.type_) { - return type_ < o.type_; - } else if (type_ == INT64) { - return int_ < o.int_; - } else if (type_ == DOUBLE) { - return dbl_ < o.dbl_; - } - assert(false); - return true; - } - - void print(std::ostream& s) const { - if (type_ == INT64) { - s << int_; - } else if (type_ == DOUBLE) { - s << dbl_; - } else { - assert(false); - } - } - - private: - enum Type { INT64, DOUBLE }; - - template - void init(T v); - - Type type_{INT64}; - union { - int64_t int_{0}; - double dbl_; - }; -}; - -template <> -inline void SampleValue::init(uint64_t v) { - int_ = v, type_ = INT64; -} -template <> -inline void SampleValue::init(int64_t v) { - int_ = v, type_ = INT64; -} -template <> -inline void SampleValue::init(int v) { - int_ = v, type_ = INT64; -} -template <> -inline void SampleValue::init(double v) { - dbl_ = v, type_ = DOUBLE; -} - -inline std::ostream& operator<<(std::ostream& out, const SampleValue& s) { - s.print(out); - return out; -} - -using PercentileList = std::vector>; - -struct Stat { - const std::string& name; - const PercentileList percentileValues; - SampleValue total; -}; - -struct Sample { 
- Sample(int stats_count) { - stats.reserve(stats_count); - } - - // Offset in milliseconds from first sample in report - int deltaMsec; - std::vector stats; -}; - -// Inherit from this to be notified of samples -class SampleListener { - public: - SampleListener(const SampleListener&) = delete; - SampleListener& operator=(const SampleListener&) = delete; - - virtual ~SampleListener(){}; - - // Report bucketed & aggregated values for event - virtual void handleSample(int device, const Sample& sample, bool from_new_version) = 0; - - virtual void update(const Config& config) = 0; - - protected: - SampleListener() = default; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/ScopeExit.h b/plugins/tensorboard-plugins/libkineto/src/ScopeExit.h deleted file mode 100644 index b9a6bc83ef9..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ScopeExit.h +++ /dev/null @@ -1,29 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -// Implement a simple scope handler allowing a function to release -// resources when an error or exception occurs - -template -class ScopeExit { - public: - explicit ScopeExit(T t) : t(t) {} - ~ScopeExit() { - t(); - } - T t; -}; - -template -ScopeExit makeScopeExit(T t) { - return ScopeExit(t); -}; - -// Add a level of indirection so __LINE__ is expanded -#define __kINETO_CONCAT(name, line) name##line -#define ANON_VAR(name, line) __kINETO_CONCAT(name, line) - -#define SCOPE_EXIT(func) \ - const auto ANON_VAR(SCOPE_BLOCK, __LINE__) = \ - makeScopeExit([=]() { func; }) diff --git a/plugins/tensorboard-plugins/libkineto/src/ThreadUtil.cpp b/plugins/tensorboard-plugins/libkineto/src/ThreadUtil.cpp deleted file mode 100644 index 0f67d54d585..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/ThreadUtil.cpp +++ /dev/null @@ -1,203 +0,0 @@ -#include "ThreadUtil.h" - -#ifndef _MSC_VER -#include -#include -#include -#include -#else // _MSC_VER -#include -#include -#define WIN32_LEAN_AND_MEAN -#define NOGDI -#include -#include -#undef ERROR -#endif // _MSC_VER - -#ifdef __ANDROID__ -#include -#endif - -#include -#include -#include - -namespace libkineto { - -namespace { -thread_local int32_t _pid = 0; -thread_local int32_t _tid = 0; -thread_local int32_t _sysTid = 0; -} - -int32_t processId() { - if (!_pid) { -#ifndef _MSC_VER - _pid = (int32_t)getpid(); -#else - _pid = (int32_t)GetCurrentProcessId(); -#endif - } - return _pid; -} - -int32_t systemThreadId() { - if (!_sysTid) { -#ifdef __APPLE__ - _sysTid = (int32_t)syscall(SYS_thread_selfid); -#elif defined _MSC_VER - _sysTid = (int32_t)GetCurrentThreadId(); -#else - _sysTid = (int32_t)syscall(SYS_gettid); -#endif - } - return _sysTid; -} - -int32_t threadId() { - if (!_tid) { -#ifdef __APPLE__ - uint64_t tid; - pthread_threadid_np(nullptr, &tid); - _tid = tid; -#elif defined _MSC_VER - _tid = (int32_t)GetCurrentThreadId(); -#else - pthread_t pth = pthread_self(); - int32_t* ptr = 
reinterpret_cast(&pth); - _tid = *ptr; -#endif - } - return _tid; -} - -namespace { -static constexpr size_t kMaxThreadNameLength = 16; - -static constexpr const char* basename(const char* s, int off = 0) { - return !s[off] - ? s - : s[off] == '/' ? basename(&s[off + 1]) : basename(s, off + 1); -} -#if defined(_MSC_VER) -void *getKernel32Func(const char* procName) { - return GetProcAddress(GetModuleHandleA("KERNEL32.DLL"), procName); -} -#endif -} - -bool setThreadName(const std::string& name) { -#ifdef __APPLE__ - return 0 == pthread_setname_np(name.c_str()); -#elif defined _MSC_VER - // Per https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreaddescription - // Use runtime linking to set thread description - static auto _SetThreadDescription = reinterpret_cast(getKernel32Func("SetThreadDescription")); - if (!_SetThreadDescription) { - return false; - } - std::wstring_convert> conv; - std::wstring wname = conv.from_bytes(name); - HRESULT hr = _SetThreadDescription(GetCurrentThread(), wname.c_str()); - return SUCCEEDED(hr); -#else - return 0 == pthread_setname_np(pthread_self(), name.c_str()); -#endif -} - -std::string getThreadName() { -#ifndef _MSC_VER - char buf[kMaxThreadNameLength] = ""; - if ( -#ifndef __ANDROID__ - pthread_getname_np(pthread_self(), buf, kMaxThreadNameLength) != 0 -#else - prctl(PR_GET_NAME, buf, kMaxThreadNameLength) != 0 -#endif - ) { - return "Unknown"; - } - return buf; -#else // _MSC_VER - static auto _GetThreadDescription = reinterpret_cast(getKernel32Func("GetThreadDescription")); - if (!_GetThreadDescription) { - return "Unknown"; - } - PWSTR data; - HRESULT hr = _GetThreadDescription(GetCurrentThread(), &data); - if (!SUCCEEDED(hr)) { - return ""; - } - std::wstring_convert> conv; - std::string name = conv.to_bytes(data); - LocalFree(data); - return name; -#endif -} - -// Linux: -// Extract process name from /proc/pid/cmdline. 
This does not have -// the 16 character limit that /proc/pid/status and /prod/pid/comm has. -std::string processName(int32_t pid) { -#ifdef __linux__ - FILE* cmdfile = fopen(fmt::format("/proc/{}/cmdline", pid).c_str(), "r"); - if (cmdfile != nullptr) { - char* command = nullptr; - int scanned = fscanf(cmdfile, "%ms", &command); - fclose(cmdfile); - if (scanned > 0 && command) { - std::string ret(basename(command)); - free(command); - return ret; - } - } - std::cerr << "Failed to read process name for pid " << pid << std::endl; -#endif - return ""; -} - -// Max number of parent pids to collect, just for extra safeguarding. -constexpr int kMaxParentPids = 10; - -// Return a pair of -static std::pair parentPidAndCommand(int32_t pid) { -#ifdef __linux__ - FILE* statfile = fopen(fmt::format("/proc/{}/stat", pid).c_str(), "r"); - if (statfile == nullptr) { - return std::make_pair(0, ""); - } - int32_t parent_pid; - char* command = nullptr; - int scanned = fscanf(statfile, "%*d (%m[^)]) %*c %d", &command, &parent_pid); - fclose(statfile); - std::pair ret; - if (scanned == 2) { - ret = std::make_pair(parent_pid, std::string(command)); - } else { - std::cerr << "Failed to parse /proc/" << pid << "/stat" << std::endl; - ret = std::make_pair(0, ""); - } - - // The 'm' character in the format tells fscanf to allocate memory - // for the parsed string, which we need to free here. 
- free(command); - return ret; -#else - return std::make_pair(0, ""); -#endif -} - -std::vector> pidCommandPairsOfAncestors() { - std::vector> pairs; - pairs.reserve(kMaxParentPids + 1); - int32_t curr_pid = processId(); - for (int i = 0; i <= kMaxParentPids && curr_pid > 1; i++) { - std::pair ppid_and_comm = parentPidAndCommand(curr_pid); - pairs.push_back(std::make_pair(curr_pid, ppid_and_comm.second)); - curr_pid = ppid_and_comm.first; - } - return pairs; -} - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/WeakSymbols.cpp b/plugins/tensorboard-plugins/libkineto/src/WeakSymbols.cpp deleted file mode 100644 index 540a5ac8f97..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/WeakSymbols.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include - -#ifndef _MSC_VER -extern "C" { -// This function is needed to avoid superfluous dependency on GNU OpenMP library when cuPTI is linked statically -// For more details see https://github.com/pytorch/pytorch/issues/51026 -__attribute__((weak)) int acc_get_device_type() { - throw std::runtime_error("Dummy implementation of acc_get_device_type is not supposed to be called!"); -} - -} // extern "C" -#endif diff --git a/plugins/tensorboard-plugins/libkineto/src/cupti_call.h b/plugins/tensorboard-plugins/libkineto/src/cupti_call.h deleted file mode 100644 index fd6ebae7691..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/cupti_call.h +++ /dev/null @@ -1,33 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include - -#ifdef HAS_CUPTI - -#include - -#define CUPTI_CALL(call) \ - [&]() -> CUptiResult { \ - CUptiResult _status_ = call; \ - if (_status_ != CUPTI_SUCCESS) { \ - const char* _errstr_ = nullptr; \ - cuptiGetResultString(_status_, &_errstr_); \ - LOG(WARNING) << fmt::format( \ - "function {} failed with error {} ({})", \ - #call, \ - _errstr_, \ - (int)_status_); \ - } \ - return _status_; \ - }() - -#define CUPTI_CALL_NOWARN(call) call - -#else - -#define CUPTI_CALL(call) call -#define CUPTI_CALL_NOWARN(call) call - -#endif // HAS_CUPTI diff --git a/plugins/tensorboard-plugins/libkineto/src/cupti_strings.cpp b/plugins/tensorboard-plugins/libkineto/src/cupti_strings.cpp deleted file mode 100644 index 4535273a277..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/cupti_strings.cpp +++ /dev/null @@ -1,502 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "cupti_strings.h" - -namespace libkineto { - -const char* memcpyKindString( - CUpti_ActivityMemcpyKind kind) { - switch (kind) { - case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: - return "HtoD"; - case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: - return "DtoH"; - case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: - return "HtoA"; - case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: - return "AtoH"; - case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA: - return "AtoA"; - case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: - return "AtoD"; - case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: - return "DtoA"; - case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: - return "DtoD"; - case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: - return "HtoH"; - case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP: - return "PtoP"; - default: - break; - } - return ""; -} - -const char* memoryKindString( - CUpti_ActivityMemoryKind kind) { - switch (kind) { - case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN: - return "Unknown"; - case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE: - return "Pageable"; - case CUPTI_ACTIVITY_MEMORY_KIND_PINNED: - return "Pinned"; - case 
CUPTI_ACTIVITY_MEMORY_KIND_DEVICE: - return "Device"; - case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY: - return "Array"; - case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED: - return "Managed"; - case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC: - return "Device Static"; - case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC: - return "Managed Static"; - case CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT: - return "Force Int"; - default: - return "Unrecognized"; - } -} - -const char* overheadKindString( - CUpti_ActivityOverheadKind kind) { - switch (kind) { - case CUPTI_ACTIVITY_OVERHEAD_UNKNOWN: - return "Unknown"; - case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER: - return "Driver Compiler"; - case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH: - return "Buffer Flush"; - case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION: - return "Instrumentation"; - case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE: - return "Resource"; - case CUPTI_ACTIVITY_OVERHEAD_FORCE_INT: - return "Force Int"; - default: - return "Unrecognized"; - } -} - - - -static const char* runtimeCbidNames[] = { - "INVALID", - "cudaDriverGetVersion", - "cudaRuntimeGetVersion", - "cudaGetDeviceCount", - "cudaGetDeviceProperties", - "cudaChooseDevice", - "cudaGetChannelDesc", - "cudaCreateChannelDesc", - "cudaConfigureCall", - "cudaSetupArgument", - "cudaGetLastError", - "cudaPeekAtLastError", - "cudaGetErrorString", - "cudaLaunch", - "cudaFuncSetCacheConfig", - "cudaFuncGetAttributes", - "cudaSetDevice", - "cudaGetDevice", - "cudaSetValidDevices", - "cudaSetDeviceFlags", - "cudaMalloc", - "cudaMallocPitch", - "cudaFree", - "cudaMallocArray", - "cudaFreeArray", - "cudaMallocHost", - "cudaFreeHost", - "cudaHostAlloc", - "cudaHostGetDevicePointer", - "cudaHostGetFlags", - "cudaMemGetInfo", - "cudaMemcpy", - "cudaMemcpy2D", - "cudaMemcpyToArray", - "cudaMemcpy2DToArray", - "cudaMemcpyFromArray", - "cudaMemcpy2DFromArray", - "cudaMemcpyArrayToArray", - "cudaMemcpy2DArrayToArray", - "cudaMemcpyToSymbol", - "cudaMemcpyFromSymbol", - "cudaMemcpyAsync", - 
"cudaMemcpyToArrayAsync", - "cudaMemcpyFromArrayAsync", - "cudaMemcpy2DAsync", - "cudaMemcpy2DToArrayAsync", - "cudaMemcpy2DFromArrayAsync", - "cudaMemcpyToSymbolAsync", - "cudaMemcpyFromSymbolAsync", - "cudaMemset", - "cudaMemset2D", - "cudaMemsetAsync", - "cudaMemset2DAsync", - "cudaGetSymbolAddress", - "cudaGetSymbolSize", - "cudaBindTexture", - "cudaBindTexture2D", - "cudaBindTextureToArray", - "cudaUnbindTexture", - "cudaGetTextureAlignmentOffset", - "cudaGetTextureReference", - "cudaBindSurfaceToArray", - "cudaGetSurfaceReference", - "cudaGLSetGLDevice", - "cudaGLRegisterBufferObject", - "cudaGLMapBufferObject", - "cudaGLUnmapBufferObject", - "cudaGLUnregisterBufferObject", - "cudaGLSetBufferObjectMapFlags", - "cudaGLMapBufferObjectAsync", - "cudaGLUnmapBufferObjectAsync", - "cudaWGLGetDevice", - "cudaGraphicsGLRegisterImage", - "cudaGraphicsGLRegisterBuffer", - "cudaGraphicsUnregisterResource", - "cudaGraphicsResourceSetMapFlags", - "cudaGraphicsMapResources", - "cudaGraphicsUnmapResources", - "cudaGraphicsResourceGetMappedPointer", - "cudaGraphicsSubResourceGetMappedArray", - "cudaVDPAUGetDevice", - "cudaVDPAUSetVDPAUDevice", - "cudaGraphicsVDPAURegisterVideoSurface", - "cudaGraphicsVDPAURegisterOutputSurface", - "cudaD3D11GetDevice", - "cudaD3D11GetDevices", - "cudaD3D11SetDirect3DDevice", - "cudaGraphicsD3D11RegisterResource", - "cudaD3D10GetDevice", - "cudaD3D10GetDevices", - "cudaD3D10SetDirect3DDevice", - "cudaGraphicsD3D10RegisterResource", - "cudaD3D10RegisterResource", - "cudaD3D10UnregisterResource", - "cudaD3D10MapResources", - "cudaD3D10UnmapResources", - "cudaD3D10ResourceSetMapFlags", - "cudaD3D10ResourceGetSurfaceDimensions", - "cudaD3D10ResourceGetMappedArray", - "cudaD3D10ResourceGetMappedPointer", - "cudaD3D10ResourceGetMappedSize", - "cudaD3D10ResourceGetMappedPitch", - "cudaD3D9GetDevice", - "cudaD3D9GetDevices", - "cudaD3D9SetDirect3DDevice", - "cudaD3D9GetDirect3DDevice", - "cudaGraphicsD3D9RegisterResource", - 
"cudaD3D9RegisterResource", - "cudaD3D9UnregisterResource", - "cudaD3D9MapResources", - "cudaD3D9UnmapResources", - "cudaD3D9ResourceSetMapFlags", - "cudaD3D9ResourceGetSurfaceDimensions", - "cudaD3D9ResourceGetMappedArray", - "cudaD3D9ResourceGetMappedPointer", - "cudaD3D9ResourceGetMappedSize", - "cudaD3D9ResourceGetMappedPitch", - "cudaD3D9Begin", - "cudaD3D9End", - "cudaD3D9RegisterVertexBuffer", - "cudaD3D9UnregisterVertexBuffer", - "cudaD3D9MapVertexBuffer", - "cudaD3D9UnmapVertexBuffer", - "cudaThreadExit", - "cudaSetDoubleForDevice", - "cudaSetDoubleForHost", - "cudaThreadSynchronize", - "cudaThreadGetLimit", - "cudaThreadSetLimit", - "cudaStreamCreate", - "cudaStreamDestroy", - "cudaStreamSynchronize", - "cudaStreamQuery", - "cudaEventCreate", - "cudaEventCreateWithFlags", - "cudaEventRecord", - "cudaEventDestroy", - "cudaEventSynchronize", - "cudaEventQuery", - "cudaEventElapsedTime", - "cudaMalloc3D", - "cudaMalloc3DArray", - "cudaMemset3D", - "cudaMemset3DAsync", - "cudaMemcpy3D", - "cudaMemcpy3DAsync", - "cudaThreadSetCacheConfig", - "cudaStreamWaitEvent", - "cudaD3D11GetDirect3DDevice", - "cudaD3D10GetDirect3DDevice", - "cudaThreadGetCacheConfig", - "cudaPointerGetAttributes", - "cudaHostRegister", - "cudaHostUnregister", - "cudaDeviceCanAccessPeer", - "cudaDeviceEnablePeerAccess", - "cudaDeviceDisablePeerAccess", - "cudaPeerRegister", - "cudaPeerUnregister", - "cudaPeerGetDevicePointer", - "cudaMemcpyPeer", - "cudaMemcpyPeerAsync", - "cudaMemcpy3DPeer", - "cudaMemcpy3DPeerAsync", - "cudaDeviceReset", - "cudaDeviceSynchronize", - "cudaDeviceGetLimit", - "cudaDeviceSetLimit", - "cudaDeviceGetCacheConfig", - "cudaDeviceSetCacheConfig", - "cudaProfilerInitialize", - "cudaProfilerStart", - "cudaProfilerStop", - "cudaDeviceGetByPCIBusId", - "cudaDeviceGetPCIBusId", - "cudaGLGetDevices", - "cudaIpcGetEventHandle", - "cudaIpcOpenEventHandle", - "cudaIpcGetMemHandle", - "cudaIpcOpenMemHandle", - "cudaIpcCloseMemHandle", - "cudaArrayGetInfo", - 
"cudaFuncSetSharedMemConfig", - "cudaDeviceGetSharedMemConfig", - "cudaDeviceSetSharedMemConfig", - "cudaCreateTextureObject", - "cudaDestroyTextureObject", - "cudaGetTextureObjectResourceDesc", - "cudaGetTextureObjectTextureDesc", - "cudaCreateSurfaceObject", - "cudaDestroySurfaceObject", - "cudaGetSurfaceObjectResourceDesc", - "cudaMallocMipmappedArray", - "cudaGetMipmappedArrayLevel", - "cudaFreeMipmappedArray", - "cudaBindTextureToMipmappedArray", - "cudaGraphicsResourceGetMappedMipmappedArray", - "cudaStreamAddCallback", - "cudaStreamCreateWithFlags", - "cudaGetTextureObjectResourceViewDesc", - "cudaDeviceGetAttribute", - "cudaStreamDestroy", - "cudaStreamCreateWithPriority", - "cudaStreamGetPriority", - "cudaStreamGetFlags", - "cudaDeviceGetStreamPriorityRange", - "cudaMallocManaged", - "cudaOccupancyMaxActiveBlocksPerMultiprocessor", - "cudaStreamAttachMemAsync", - "cudaGetErrorName", - "cudaOccupancyMaxActiveBlocksPerMultiprocessor", - "cudaLaunchKernel", - "cudaGetDeviceFlags", - "cudaLaunch_ptsz", - "cudaLaunchKernel_ptsz", - "cudaMemcpy_ptds", - "cudaMemcpy2D_ptds", - "cudaMemcpyToArray_ptds", - "cudaMemcpy2DToArray_ptds", - "cudaMemcpyFromArray_ptds", - "cudaMemcpy2DFromArray_ptds", - "cudaMemcpyArrayToArray_ptds", - "cudaMemcpy2DArrayToArray_ptds", - "cudaMemcpyToSymbol_ptds", - "cudaMemcpyFromSymbol_ptds", - "cudaMemcpyAsync_ptsz", - "cudaMemcpyToArrayAsync_ptsz", - "cudaMemcpyFromArrayAsync_ptsz", - "cudaMemcpy2DAsync_ptsz", - "cudaMemcpy2DToArrayAsync_ptsz", - "cudaMemcpy2DFromArrayAsync_ptsz", - "cudaMemcpyToSymbolAsync_ptsz", - "cudaMemcpyFromSymbolAsync_ptsz", - "cudaMemset_ptds", - "cudaMemset2D_ptds", - "cudaMemsetAsync_ptsz", - "cudaMemset2DAsync_ptsz", - "cudaStreamGetPriority_ptsz", - "cudaStreamGetFlags_ptsz", - "cudaStreamSynchronize_ptsz", - "cudaStreamQuery_ptsz", - "cudaStreamAttachMemAsync_ptsz", - "cudaEventRecord_ptsz", - "cudaMemset3D_ptds", - "cudaMemset3DAsync_ptsz", - "cudaMemcpy3D_ptds", - "cudaMemcpy3DAsync_ptsz", - 
"cudaStreamWaitEvent_ptsz", - "cudaStreamAddCallback_ptsz", - "cudaMemcpy3DPeer_ptds", - "cudaMemcpy3DPeerAsync_ptsz", - "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", - "cudaMemPrefetchAsync", - "cudaMemPrefetchAsync_ptsz", - "cudaMemAdvise", - "cudaDeviceGetP2PAttribute", - "cudaGraphicsEGLRegisterImage", - "cudaEGLStreamConsumerConnect", - "cudaEGLStreamConsumerDisconnect", - "cudaEGLStreamConsumerAcquireFrame", - "cudaEGLStreamConsumerReleaseFrame", - "cudaEGLStreamProducerConnect", - "cudaEGLStreamProducerDisconnect", - "cudaEGLStreamProducerPresentFrame", - "cudaEGLStreamProducerReturnFrame", - "cudaGraphicsResourceGetMappedEglFrame", - "cudaMemRangeGetAttribute", - "cudaMemRangeGetAttributes", - "cudaEGLStreamConsumerConnectWithFlags", - "cudaLaunchCooperativeKernel", - "cudaLaunchCooperativeKernel_ptsz", - "cudaEventCreateFromEGLSync", - "cudaLaunchCooperativeKernelMultiDevice", - "cudaFuncSetAttribute", - "cudaImportExternalMemory", - "cudaExternalMemoryGetMappedBuffer", - "cudaExternalMemoryGetMappedMipmappedArray", - "cudaDestroyExternalMemory", - "cudaImportExternalSemaphore", - "cudaSignalExternalSemaphoresAsync", - "cudaSignalExternalSemaphoresAsync_ptsz", - "cudaWaitExternalSemaphoresAsync", - "cudaWaitExternalSemaphoresAsync_ptsz", - "cudaDestroyExternalSemaphore", - "cudaLaunchHostFunc", - "cudaLaunchHostFunc_ptsz", - "cudaGraphCreate", - "cudaGraphKernelNodeGetParams", - "cudaGraphKernelNodeSetParams", - "cudaGraphAddKernelNode", - "cudaGraphAddMemcpyNode", - "cudaGraphMemcpyNodeGetParams", - "cudaGraphMemcpyNodeSetParams", - "cudaGraphAddMemsetNode", - "cudaGraphMemsetNodeGetParams", - "cudaGraphMemsetNodeSetParams", - "cudaGraphAddHostNode", - "cudaGraphHostNodeGetParams", - "cudaGraphAddChildGraphNode", - "cudaGraphChildGraphNodeGetGraph", - "cudaGraphAddEmptyNode", - "cudaGraphClone", - "cudaGraphNodeFindInClone", - "cudaGraphNodeGetType", - "cudaGraphGetRootNodes", - "cudaGraphNodeGetDependencies", - 
"cudaGraphNodeGetDependentNodes", - "cudaGraphAddDependencies", - "cudaGraphRemoveDependencies", - "cudaGraphDestroyNode", - "cudaGraphInstantiate", - "cudaGraphLaunch", - "cudaGraphLaunch_ptsz", - "cudaGraphExecDestroy", - "cudaGraphDestroy", - "cudaStreamBeginCapture", - "cudaStreamBeginCapture_ptsz", - "cudaStreamIsCapturing", - "cudaStreamIsCapturing_ptsz", - "cudaStreamEndCapture", - "cudaStreamEndCapture_ptsz", - "cudaGraphHostNodeSetParams", - "cudaGraphGetNodes", - "cudaGraphGetEdges", - "cudaStreamGetCaptureInfo", - "cudaStreamGetCaptureInfo_ptsz", - "cudaGraphExecKernelNodeSetParams", - "cudaThreadExchangeStreamCaptureMode", - "cudaDeviceGetNvSciSyncAttributes", - "cudaOccupancyAvailableDynamicSMemPerBlock", - "cudaStreamSetFlags", - "cudaStreamSetFlags_ptsz", - "cudaGraphExecMemcpyNodeSetParams", - "cudaGraphExecMemsetNodeSetParams", - "cudaGraphExecHostNodeSetParams", - "cudaGraphExecUpdate", - "cudaGetFuncBySymbol", - "cudaCtxResetPersistingL2Cache", - "cudaGraphKernelNodeCopyAttributes", - "cudaGraphKernelNodeGetAttribute", - "cudaGraphKernelNodeSetAttribute", - "cudaStreamCopyAttributes", - "cudaStreamCopyAttributes_ptsz", - "cudaStreamGetAttribute", - "cudaStreamGetAttribute_ptsz", - "cudaStreamSetAttribute", - "cudaStreamSetAttribute_ptsz", - "cudaDeviceGetTexture1DLinearMaxWidth", - "cudaGraphUpload", - "cudaGraphUpload_ptsz", - "cudaGraphAddMemcpyNodeToSymbol", - "cudaGraphAddMemcpyNodeFromSymbol", - "cudaGraphAddMemcpyNode1D", - "cudaGraphMemcpyNodeSetParamsToSymbol", - "cudaGraphMemcpyNodeSetParamsFromSymbol", - "cudaGraphMemcpyNodeSetParams1D", - "cudaGraphExecMemcpyNodeSetParamsToSymbol", - "cudaGraphExecMemcpyNodeSetParamsFromSymbol", - "cudaGraphExecMemcpyNodeSetParams1D", - "cudaArrayGetSparseProperties", - "cudaMipmappedArrayGetSparseProperties", - "cudaGraphExecChildGraphNodeSetParams", - "cudaGraphAddEventRecordNode", - "cudaGraphEventRecordNodeGetEvent", - "cudaGraphEventRecordNodeSetEvent", - "cudaGraphAddEventWaitNode", - 
"cudaGraphEventWaitNodeGetEvent", - "cudaGraphEventWaitNodeSetEvent", - "cudaGraphExecEventRecordNodeSetEvent", - "cudaGraphExecEventWaitNodeSetEvent", - "cudaEventRecordWithFlags", - "cudaEventRecordWithFlags_ptsz", - "cudaDeviceGetDefaultMemPool", - "cudaMallocAsync", - "cudaMallocAsync_ptsz", - "cudaFreeAsync", - "cudaFreeAsync_ptsz", - "cudaMemPoolTrimTo", - "cudaMemPoolSetAttribute", - "cudaMemPoolGetAttribute", - "cudaMemPoolSetAccess", - "cudaArrayGetPlane", - "cudaMemPoolGetAccess", - "cudaMemPoolCreate", - "cudaMemPoolDestroy", - "cudaDeviceSetMemPool", - "cudaDeviceGetMemPool", - "cudaMemPoolExportToShareableHandle", - "cudaMemPoolImportFromShareableHandle", - "cudaMemPoolExportPointer", - "cudaMemPoolImportPointer", - "cudaMallocFromPoolAsync", - "cudaMallocFromPoolAsync_ptsz", - "cudaSignalExternalSemaphoresAsync", - "cudaSignalExternalSemaphoresAsync", - "cudaWaitExternalSemaphoresAsync", - "cudaWaitExternalSemaphoresAsync", - "cudaGraphAddExternalSemaphoresSignalNode", - "cudaGraphExternalSemaphoresSignalNodeGetParams", - "cudaGraphExternalSemaphoresSignalNodeSetParams", - "cudaGraphAddExternalSemaphoresWaitNode", - "cudaGraphExternalSemaphoresWaitNodeGetParams", - "cudaGraphExternalSemaphoresWaitNodeSetParams", - "cudaGraphExecExternalSemaphoresSignalNodeSetParams", - "cudaGraphExecExternalSemaphoresWaitNodeSetParams", - "SIZE" -}; - -const char* runtimeCbidName(CUpti_CallbackId cbid) { - constexpr int names_size = - sizeof(runtimeCbidNames) / sizeof(runtimeCbidNames[0]); - if (cbid < 0 || cbid >= names_size) { - return runtimeCbidNames[CUPTI_RUNTIME_TRACE_CBID_INVALID]; - } - return runtimeCbidNames[cbid]; -} - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/cupti_strings.h b/plugins/tensorboard-plugins/libkineto/src/cupti_strings.h deleted file mode 100644 index bbfebb98364..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/cupti_strings.h +++ /dev/null @@ -1,14 +0,0 @@ -// (c) Meta Platforms, Inc. 
and affiliates. Confidential and proprietary. - -#pragma once - -#include - -namespace libkineto { - -const char* memoryKindString(CUpti_ActivityMemoryKind kind); -const char* memcpyKindString(CUpti_ActivityMemcpyKind kind); -const char* runtimeCbidName(CUpti_CallbackId cbid); -const char* overheadKindString(CUpti_ActivityOverheadKind kind); - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/init.cpp b/plugins/tensorboard-plugins/libkineto/src/init.cpp deleted file mode 100644 index 4e1022485ac..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/init.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include - -#include "ActivityProfilerProxy.h" -#include "Config.h" -#ifdef HAS_CUPTI -#include "CuptiCallbackApi.h" -#include "CuptiActivityApi.h" -#include "EventProfilerController.h" -#endif -#include "cupti_call.h" -#include "libkineto.h" - -#include "Logger.h" - -namespace KINETO_NAMESPACE { - -#ifdef HAS_CUPTI -static bool initialized = false; -static std::mutex initMutex; - -static void initProfilers( - CUpti_CallbackDomain /*domain*/, - CUpti_CallbackId /*cbid*/, - const CUpti_CallbackData* cbInfo) { - CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo; - CUcontext ctx = d->context; - - VLOG(0) << "CUDA Context created"; - std::lock_guard lock(initMutex); - - if (!initialized) { - libkineto::api().initProfilerIfRegistered(); - initialized = true; - VLOG(0) << "libkineto profilers activated"; - } - if (getenv("KINETO_DISABLE_EVENT_PROFILER") != nullptr) { - VLOG(0) << "Event profiler disabled via env var"; - } else { - ConfigLoader& config_loader = libkineto::api().configLoader(); - config_loader.initBaseConfig(); - EventProfilerController::start(ctx, config_loader); - } -} - -// Some models suffer from excessive instrumentation code gen -// on dynamic attach which can hang for more than 5+ seconds. 
-// If the workload was meant to be traced, preload the CUPTI -// to take the performance hit early on. -// https://docs.nvidia.com/cupti/r_main.html#r_overhead -static bool shouldPreloadCuptiInstrumentation() { - return getenv("PRELOAD_CUPTI_INSTRUMENTATION"); -} - -static void stopProfiler( - CUpti_CallbackDomain /*domain*/, - CUpti_CallbackId /*cbid*/, - const CUpti_CallbackData* cbInfo) { - CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo; - CUcontext ctx = d->context; - - LOG(INFO) << "CUDA Context destroyed"; - std::lock_guard lock(initMutex); - EventProfilerController::stop(ctx); -} -#endif // HAS_CUPTI - -} // namespace KINETO_NAMESPACE - -// Callback interface with CUPTI and library constructors -using namespace KINETO_NAMESPACE; -extern "C" { - -// Return true if no CUPTI errors occurred during init -bool libkineto_init(bool cpuOnly, bool logOnError) { - bool success = true; -#ifdef HAS_CUPTI - if (!cpuOnly) { - // libcupti will be lazily loaded on this call. - // If it is not available (e.g. CUDA is not installed), - // then this call will return an error and we just abort init. 
- auto& cbapi = CuptiCallbackApi::singleton(); - bool status = false; - - if (cbapi.initSuccess()){ - const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RESOURCE; - status = cbapi.registerCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, initProfilers); - status = status && cbapi.registerCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, stopProfiler); - - if (status) { - status = cbapi.enableCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED); - status = status && cbapi.enableCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED); - } - } - - if (!cbapi.initSuccess() || !status) { - success = false; - cpuOnly = true; - if (logOnError) { - CUPTI_CALL(cbapi.getCuptiStatus()); - LOG(WARNING) << "CUPTI initialization failed - " - << "CUDA profiler activities will be missing"; - LOG(INFO) << "If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to " - << "https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti"; - } - } - } - - if (shouldPreloadCuptiInstrumentation()) { - CuptiActivityApi::forceLoadCupti(); - } -#endif // HAS_CUPTI - - ConfigLoader& config_loader = libkineto::api().configLoader(); - libkineto::api().registerProfiler( - std::make_unique(cpuOnly, config_loader)); - - return success; -} - -// The cuda driver calls this function if the CUDA_INJECTION64_PATH environment -// variable is set -int InitializeInjection(void) { - LOG(INFO) << "Injection mode: Initializing libkineto"; - libkineto_init(false /*cpuOnly*/, true /*logOnError*/); - return 1; -} - -void suppressLibkinetoLogMessages() { - SET_LOG_SEVERITY_LEVEL(ERROR); -} - -} // extern C diff --git a/plugins/tensorboard-plugins/libkineto/src/libkineto_api.cpp b/plugins/tensorboard-plugins/libkineto/src/libkineto_api.cpp deleted file mode 100644 index 9a622e4f5e5..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/libkineto_api.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. 
Confidential and proprietary. - -#include "libkineto.h" - -#include "ConfigLoader.h" -#include "ThreadUtil.h" - -namespace libkineto { - -LibkinetoApi& api() { - static LibkinetoApi instance(ConfigLoader::instance()); - return instance; -} - -void LibkinetoApi::initClientIfRegistered() { - if (client_) { - if (clientRegisterThread_ != threadId()) { - fprintf( - stderr, - "ERROR: External init callback must run in same thread as registerClient " - "(%d != %d)\n", - threadId(), - (int)clientRegisterThread_); - } else { - client_->init(); - } - } -} - -void LibkinetoApi::registerClient(ClientInterface* client) { - client_ = client; - if (client && activityProfiler_) { - // Can initialize straight away - client->init(); - } - // Assume here that the external init callback is *not* threadsafe - // and only call it if it's the same thread that called registerClient - clientRegisterThread_ = threadId(); -} - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/src/output_base.h b/plugins/tensorboard-plugins/libkineto/src/output_base.h deleted file mode 100644 index 29d0d57768c..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/output_base.h +++ /dev/null @@ -1,104 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include -#include - -#ifdef HAS_CUPTI -#include -#include "CuptiActivity.h" -#endif // HAS_CUPTI -#include "ActivityBuffers.h" -#include "GenericTraceActivity.h" -#include "ThreadUtil.h" -#include "TraceSpan.h" - -namespace KINETO_NAMESPACE { - class Config; - class GpuKernelActivity; - struct RuntimeActivity; -} - -namespace libkineto { - -using namespace KINETO_NAMESPACE; - -class ActivityLogger { - public: - - virtual ~ActivityLogger() = default; - - struct DeviceInfo { - DeviceInfo(int64_t id, const std::string& name, const std::string& label) : - id(id), name(name), label(label) {} - int64_t id; - const std::string name; - const std::string label; - }; - - struct ResourceInfo { - ResourceInfo( - int64_t deviceId, - int64_t id, - int64_t sortIndex, - const std::string& name) : - id(id), sortIndex(sortIndex), deviceId(deviceId), name(name) {} - int64_t id; - int64_t sortIndex; - int64_t deviceId; - const std::string name; - }; - - struct OverheadInfo { - explicit OverheadInfo(const std::string& name) : name(name) {} - const std::string name; - }; - - virtual void handleDeviceInfo( - const DeviceInfo& info, - uint64_t time) = 0; - - virtual void handleResourceInfo(const ResourceInfo& info, int64_t time) = 0; - - virtual void handleOverheadInfo(const OverheadInfo& info, int64_t time) = 0; - - virtual void handleTraceSpan(const TraceSpan& span) = 0; - - virtual void handleActivity( - const libkineto::ITraceActivity& activity) = 0; - virtual void handleGenericActivity( - const libkineto::GenericTraceActivity& activity) = 0; - -#ifdef HAS_CUPTI - virtual void handleGpuActivity( - const GpuActivity& activity) = 0; - virtual void handleGpuActivity( - const GpuActivity& activity) = 0; - virtual void handleGpuActivity( - const GpuActivity& activity) = 0; - virtual void handleGpuActivity( - const GpuActivity& activity) = 0; -#endif // HAS_CUPTI - - virtual void handleTraceStart( - const std::unordered_map& metadata) = 0; - 
- void handleTraceStart() { - handleTraceStart(std::unordered_map()); - } - - virtual void finalizeTrace( - const KINETO_NAMESPACE::Config& config, - std::unique_ptr buffers, - int64_t endTime, - std::unordered_map>& metadata) = 0; - - protected: - ActivityLogger() = default; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/output_csv.cpp b/plugins/tensorboard-plugins/libkineto/src/output_csv.cpp deleted file mode 100644 index e56c0229398..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/output_csv.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "output_csv.h" - -#include -#include -#include - -#include -#include - -#include "Config.h" -#include "Logger.h" - -namespace KINETO_NAMESPACE { - -static void write_header( - std::ostream& out, - const std::vector& percentiles) { - out << "timestamp,delta_ms,device,event_name"; - for (int p : percentiles) { - out << ",p" << p; - } - out << ",total" << std::endl; -} - -void EventCSVLogger::update(const Config& config) { - eventNames_.clear(); - eventNames_.insert(config.eventNames().begin(), config.eventNames().end()); - eventNames_.insert(config.metricNames().begin(), config.metricNames().end()); - if (config.percentiles() != percentiles_) { - percentiles_ = config.percentiles(); - if (out_) { - write_header(*out_, percentiles_); - } - } -} - -void EventCSVLogger::handleSample(int device, const Sample& sample, bool from_new_version) { - using namespace std::chrono; - if (out_) { - auto now = system_clock::now(); - auto time = system_clock::to_time_t(now); - for (const Stat& s : sample.stats) { - if (eventNames_.find(s.name) == eventNames_.end()) { - continue; - } - *out_ << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(time)) << ","; - *out_ << sample.deltaMsec << ","; - *out_ << device << ","; - *out_ << s.name; - for (const auto& p : s.percentileValues) { - *out_ << "," << p.second; - } - 
*out_ << "," << s.total << std::endl; - } - } -} - -void EventCSVFileLogger::update(const Config& config) { - if (config.eventLogFile() != filename_) { - if (of_.is_open()) { - of_.close(); - out_ = nullptr; - percentiles_.clear(); - } - filename_ = config.eventLogFile(); - if (!filename_.empty()) { - of_.open(filename_, std::ios::out | std::ios::trunc); - out_ = &of_; - } - } - EventCSVLogger::update(config); -} - -void EventCSVDbgLogger::update(const Config& config) { - if (out_ && config.verboseLogLevel() < 0) { - out_ = nullptr; - } else if (!out_ && config.verboseLogLevel() >= 0) { - out_ = &LIBKINETO_DBG_STREAM; - } - if (config.verboseLogLevel() >= 0) { - percentiles_.clear(); - EventCSVLogger::update(config); - } -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/output_csv.h b/plugins/tensorboard-plugins/libkineto/src/output_csv.h deleted file mode 100644 index bca29f4db99..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/output_csv.h +++ /dev/null @@ -1,39 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once -#include "SampleListener.h" - -#include -#include -#include - -namespace KINETO_NAMESPACE { - -class EventCSVLogger : public SampleListener { - public: - void update(const Config& config) override; - void handleSample(int device, const Sample& sample, bool from_new_version) override; - - protected: - EventCSVLogger() : out_(nullptr) {} - - std::ostream* out_; - std::set eventNames_; - std::vector percentiles_; -}; - -class EventCSVFileLogger : public EventCSVLogger { - public: - void update(const Config& config) override; - - private: - std::ofstream of_; - std::string filename_; -}; - -class EventCSVDbgLogger : public EventCSVLogger { - public: - void update(const Config& config) override; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/output_json.cpp b/plugins/tensorboard-plugins/libkineto/src/output_json.cpp deleted file mode 100644 index 0ef22339fad..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/output_json.cpp +++ /dev/null @@ -1,583 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "output_json.h" - -#include -#include -#include -#include - -#include "Config.h" -#ifdef HAS_CUPTI -#include "CuptiActivity.h" -#include "CuptiActivity.tpp" -#include "CuptiActivityApi.h" -#include "CudaDeviceProperties.h" -#endif // HAS_CUPTI -#include "Demangle.h" -#include "TraceSpan.h" - -#include "Logger.h" - -using std::endl; -using namespace libkineto; - -namespace KINETO_NAMESPACE { - -static constexpr int kSchemaVersion = 1; -static constexpr char kFlowStart = 's'; -static constexpr char kFlowEnd = 'f'; - -#ifdef __linux__ -static constexpr char kDefaultLogFileFmt[] = - "/tmp/libkineto_activities_{}.json"; -#else -static constexpr char kDefaultLogFileFmt[] = "libkineto_activities_{}.json"; -#endif - -std::string& ChromeTraceLogger::sanitizeStrForJSON(std::string& value) { -// Replace all backslashes with forward slash because Windows paths causing JSONDecodeError. -#ifdef _WIN32 - std::replace(value.begin(), value.end(), '\\', '/'); -#endif - return value; -} - -void ChromeTraceLogger::metadataToJSON( - const std::unordered_map& metadata) { - for (const auto& kv : metadata) { - traceOf_ << fmt::format(R"JSON( - "{}": {},)JSON", kv.first, kv.second); - } -} - -void ChromeTraceLogger::handleTraceStart( - const std::unordered_map& metadata) { - traceOf_ << fmt::format(R"JSON( -{{ - "schemaVersion": {},)JSON", kSchemaVersion); - -#ifdef HAS_CUPTI - traceOf_ << fmt::format(R"JSON( - "deviceProperties": [{} - ],)JSON", devicePropertiesJson()); -#endif - - metadataToJSON(metadata); - traceOf_ << R"JSON( - "traceEvents": [)JSON"; -} - -static std::string defaultFileName() { - return fmt::format(kDefaultLogFileFmt, processId()); -} - -void ChromeTraceLogger::openTraceFile() { - traceOf_.open(fileName_, std::ofstream::out | std::ofstream::trunc); - if (!traceOf_) { - PLOG(ERROR) << "Failed to open '" << fileName_ << "'"; - } else { - LOG(INFO) << "Tracing to " << fileName_; - } -} - -ChromeTraceLogger::ChromeTraceLogger(const std::string& traceFileName) 
{ - fileName_ = traceFileName.empty() ? defaultFileName() : traceFileName; - traceOf_.clear(std::ios_base::badbit); - openTraceFile(); -} - -static int64_t us(int64_t timestamp) { - // It's important that this conversion is the same here and in the CPU trace. - // No rounding! - return timestamp / 1000; -} - -void ChromeTraceLogger::handleDeviceInfo( - const DeviceInfo& info, - uint64_t time) { - if (!traceOf_) { - return; - } - - // M is for metadata - // process_name needs a pid and a name arg - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "name": "process_name", "ph": "M", "ts": {}, "pid": {}, "tid": 0, - "args": {{ - "name": "{}" - }} - }}, - {{ - "name": "process_labels", "ph": "M", "ts": {}, "pid": {}, "tid": 0, - "args": {{ - "labels": "{}" - }} - }}, - {{ - "name": "process_sort_index", "ph": "M", "ts": {}, "pid": {}, "tid": 0, - "args": {{ - "sort_index": {} - }} - }},)JSON", - time, info.id, - info.name, - time, info.id, - info.label, - time, info.id, - info.id < 8 ? 
info.id + 0x1000000ll : info.id); - // clang-format on -} - -void ChromeTraceLogger::handleResourceInfo( - const ResourceInfo& info, - int64_t time) { - if (!traceOf_) { - return; - } - - // M is for metadata - // thread_name needs a pid and a name arg - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "name": "thread_name", "ph": "M", "ts": {}, "pid": {}, "tid": {}, - "args": {{ - "name": "{}" - }} - }}, - {{ - "name": "thread_sort_index", "ph": "M", "ts": {}, "pid": {}, "tid": {}, - "args": {{ - "sort_index": {} - }} - }},)JSON", - time, info.deviceId, info.id, - info.name, - time, info.deviceId, info.id, - info.sortIndex); - // clang-format on -} - -void ChromeTraceLogger::handleOverheadInfo( - const OverheadInfo& info, - int64_t time) { - if (!traceOf_) { - return; - } - - // TOOD: reserve pid = -1 for overhead but we need to rethink how to scale this for - // other metadata - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "name": "process_name", "ph": "M", "ts": {}, "pid": -1, "tid": 0, - "args": {{ - "name": "{}" - }} - }}, - {{ - "name": "process_sort_index", "ph": "M", "ts": {}, "pid": -1, "tid": 0, - "args": {{ - "sort_index": {} - }} - }},)JSON", - time, - info.name, - time, - 0x100000All); - // clang-format on -} - -void ChromeTraceLogger::handleTraceSpan(const TraceSpan& span) { - if (!traceOf_) { - return; - } - - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "X", "cat": "Trace", "ts": {}, "dur": {}, - "pid": "Spans", "tid": "{}", - "name": "{}{} ({})", - "args": {{ - "Op count": {} - }} - }}, - {{ - "name": "process_sort_index", "ph": "M", "ts": {}, - "pid": "Spans", "tid": 0, - "args": {{ - "sort_index": {} - }} - }},)JSON", - span.startTime, span.endTime - span.startTime, - span.name, - span.prefix, span.name, span.iteration, - span.opCount, - span.startTime, - // Large sort index to appear at the bottom - 0x20000000ll); - // clang-format on - - addIterationMarker(span); -} - -void 
ChromeTraceLogger::addIterationMarker(const TraceSpan& span) { - if (!traceOf_) { - return; - } - - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "name": "Iteration Start: {}", "ph": "i", "s": "g", - "pid": "Traces", "tid": "Trace {}", "ts": {} - }},)JSON", - span.name, - span.name, span.startTime); - // clang-format on -} - -static std::string traceActivityJson(const ITraceActivity& activity) { - // clang-format off - int64_t ts = activity.timestamp(); - int64_t duration = activity.duration(); - if (activity.type() == ActivityType::GPU_USER_ANNOTATION) { - // The GPU user annotations start at the same time as the - // first associated GPU activity. Since they appear later - // in the trace file, this causes a visualization issue in Chrome. - // Make it start one us earlier. - ts--; - duration++; // Still need it to end at the orginal point - } - return fmt::format(R"JSON( - "name": "{}", "pid": {}, "tid": {}, - "ts": {}, "dur": {})JSON", - activity.name(), activity.deviceId(), activity.resourceId(), - ts, duration); - // clang-format on -} - -void ChromeTraceLogger::handleGenericInstantEvent( - const libkineto::ITraceActivity& op) { - if (!traceOf_) { - return; - } - - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "i", "s": "t", "name": "{}", - "pid": {}, "tid": {}, - "ts": {}, - "args": {{ - {} - }} - }},)JSON", - op.name(), op.deviceId(), op.resourceId(), - op.timestamp(), op.metadataJson()); -} - -void ChromeTraceLogger::handleActivity( - const libkineto::ITraceActivity& op) { - if (!traceOf_) { - return; - } - - if (op.type() == ActivityType::CPU_INSTANT_EVENT) { - handleGenericInstantEvent(op); - return; - } - - const std::string op_metadata = op.metadataJson(); - std::string separator = ""; - if (op_metadata.find_first_not_of(" \t\n") != std::string::npos) { - separator = ",\n "; - } - std::string span = ""; - if (op.traceSpan()) { - span = fmt::format(R"JSON( - "Trace name": "{}", "Trace iteration": {},)JSON", - op.traceSpan()->name, - 
op.traceSpan()->iteration); - } - - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "X", "cat": "{}", {}, - "args": {{{} - "External id": {}{}{} - }} - }},)JSON", - toString(op.type()), traceActivityJson(op), - // args - span, - op.correlationId(), separator, op_metadata); - // clang-format on - if (op.flowId() > 0) { - handleGenericLink(op); - } -} - -void ChromeTraceLogger::handleGenericActivity( - const libkineto::GenericTraceActivity& op) { - handleActivity(op); -} - -void ChromeTraceLogger::handleGenericLink(const ITraceActivity& act) { - static struct { - int type; - char longName[24]; - char shortName[16]; - } flow_names[] = { - {kLinkFwdBwd, "forward_backward", "fwd_bwd"}, - {kLinkAsyncCpuGpu, "async_cpu_to_gpu", "async_gpu"} - }; - for (auto& flow : flow_names) { - if (act.flowType() == flow.type) { - // Link the activities via flow ID in source and destination. - // The source node must return true from flowStart() - // and the destination node false. - if (act.flowStart()) { - handleLink(kFlowStart, act, act.flowId(), flow.longName, flow.shortName); - } else { - handleLink(kFlowEnd, act, act.flowId(), flow.longName, flow.shortName); - } - return; - } - } - LOG(ERROR) << "Unknown flow type: " << act.flowType(); -} - -void ChromeTraceLogger::handleLink( - char type, - const ITraceActivity& e, - int64_t id, - const std::string& cat, - const std::string& name) { - if (!traceOf_) { - return; - } - - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "{}", "id": {}, "pid": {}, "tid": {}, "ts": {}, - "cat": "{}", "name": "{}", "bp": "e" - }},)JSON", - type, id, e.deviceId(), e.resourceId(), e.timestamp(), cat, name); - // clang-format on -} - -#ifdef HAS_CUPTI -// GPU side kernel activity -void ChromeTraceLogger::handleGpuActivity( - const GpuActivity& activity) { - if (!traceOf_) { - return; - } - const CUpti_ActivityKernel4* kernel = &activity.raw(); - constexpr int threads_per_warp = 32; - float blocks_per_sm = -1.0; - 
float warps_per_sm = -1.0; - int sm_count = smCount(kernel->deviceId); - if (sm_count) { - blocks_per_sm = - (kernel->gridX * kernel->gridY * kernel->gridZ) / (float) sm_count; - warps_per_sm = - blocks_per_sm * (kernel->blockX * kernel->blockY * kernel->blockZ) - / threads_per_warp; - } - - // Calculate occupancy - float occupancy = KINETO_NAMESPACE::kernelOccupancy( - kernel->deviceId, - kernel->registersPerThread, - kernel->staticSharedMemory, - kernel->dynamicSharedMemory, - kernel->blockX, - kernel->blockY, - kernel->blockZ, - blocks_per_sm); - - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "X", "cat": "Kernel", {}, - "args": {{ - "queued": {}, "device": {}, "context": {}, - "stream": {}, "correlation": {}, - "registers per thread": {}, - "shared memory": {}, - "blocks per SM": {}, - "warps per SM": {}, - "grid": [{}, {}, {}], - "block": [{}, {}, {}], - "est. achieved occupancy %": {} - }} - }},)JSON", - traceActivityJson(activity), - // args - us(kernel->queued), kernel->deviceId, kernel->contextId, - kernel->streamId, kernel->correlationId, - kernel->registersPerThread, - kernel->staticSharedMemory + kernel->dynamicSharedMemory, - blocks_per_sm, - warps_per_sm, - kernel->gridX, kernel->gridY, kernel->gridZ, - kernel->blockX, kernel->blockY, kernel->blockZ, - (int) (0.5 + occupancy * 100.0)); - // clang-format on - - auto to_id = activity.correlationId(); - handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); -} - -static std::string bandwidth(uint64_t bytes, uint64_t duration) { - return duration == 0 ? 
"\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); -} - -// GPU side memcpy activity -void ChromeTraceLogger::handleGpuActivity( - const GpuActivity& activity) { - if (!traceOf_) { - return; - } - const CUpti_ActivityMemcpy& memcpy = activity.raw(); - VLOG(2) << memcpy.correlationId << ": MEMCPY"; - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "X", "cat": "Memcpy", {}, - "args": {{ - "device": {}, "context": {}, - "stream": {}, "correlation": {}, - "bytes": {}, "memory bandwidth (GB/s)": {} - }} - }},)JSON", - traceActivityJson(activity), - // args - memcpy.deviceId, memcpy.contextId, - memcpy.streamId, memcpy.correlationId, - memcpy.bytes, bandwidth(memcpy.bytes, memcpy.end - memcpy.start)); - // clang-format on - - int64_t to_id = activity.correlationId(); - handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); -} - -// GPU side memcpy activity -void ChromeTraceLogger::handleGpuActivity( - const GpuActivity& activity) { - if (!traceOf_) { - return; - } - const CUpti_ActivityMemcpy2& memcpy = activity.raw(); - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "X", "cat": "Memcpy", {}, - "args": {{ - "fromDevice": {}, "inDevice": {}, "toDevice": {}, - "fromContext": {}, "inContext": {}, "toContext": {}, - "stream": {}, "correlation": {}, - "bytes": {}, "memory bandwidth (GB/s)": {} - }} - }},)JSON", - traceActivityJson(activity), - // args - memcpy.srcDeviceId, memcpy.deviceId, memcpy.dstDeviceId, - memcpy.srcContextId, memcpy.contextId, memcpy.dstContextId, - memcpy.streamId, memcpy.correlationId, - memcpy.bytes, bandwidth(memcpy.bytes, memcpy.end - memcpy.start)); - // clang-format on - - int64_t to_id = activity.correlationId(); - handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); -} - -void ChromeTraceLogger::handleGpuActivity( - const GpuActivity& activity) { - if (!traceOf_) { - return; - } - const CUpti_ActivityMemset& memset = activity.raw(); - // clang-format off - traceOf_ 
<< fmt::format(R"JSON( - {{ - "ph": "X", "cat": "Memset", {}, - "args": {{ - "device": {}, "context": {}, - "stream": {}, "correlation": {}, - "bytes": {}, "memory bandwidth (GB/s)": {} - }} - }},)JSON", - traceActivityJson(activity), - // args - memset.deviceId, memset.contextId, - memset.streamId, memset.correlationId, - memset.bytes, bandwidth(memset.bytes, memset.end - memset.start)); - // clang-format on - - int64_t to_id = activity.correlationId(); - handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); -} -#endif // HAS_CUPTI - -void ChromeTraceLogger::finalizeTrace( - const Config& /*unused*/, - std::unique_ptr /*unused*/, - int64_t endTime, - std::unordered_map>& metadata) { - if (!traceOf_) { - LOG(ERROR) << "Failed to write to log file!"; - return; - } - LOG(INFO) << "Chrome Trace written to " << fileName_; - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "name": "Record Window End", "ph": "i", "s": "g", - "pid": "", "tid": "", "ts": {} - }} - ],)JSON", - endTime); - -#if !USE_GOOGLE_LOG - std::unordered_map PreparedMetadata; - for (const auto& kv : metadata) { - // Skip empty log buckets, ex. skip ERROR if its empty. - if (!kv.second.empty()) { - std::string value = "["; - // Ex. Each metadata from logger is a list of strings, expressed in JSON as - // "ERROR": ["Error 1", "Error 2"], - // "WARNING": ["Warning 1", "Warning 2", "Warning 3"], - // ... - int mdv_count = kv.second.size(); - for (const auto& v : kv.second) { - value.append("\"" + v + "\""); - if(mdv_count > 1) { - value.append(","); - mdv_count--; - } - } - value.append("]"); - PreparedMetadata[kv.first] = sanitizeStrForJSON(value); - } - } - metadataToJSON(PreparedMetadata); -#endif // !USE_GOOGLE_LOG - - // Putting this here because the last entry MUST not end with a comma. 
- traceOf_ << fmt::format(R"JSON( - "traceName": "{}" -}})JSON", sanitizeStrForJSON(fileName_)); - // clang-format on - - traceOf_.close(); -} - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/output_json.h b/plugins/tensorboard-plugins/libkineto/src/output_json.h deleted file mode 100644 index 5a8a81e4a9f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/output_json.h +++ /dev/null @@ -1,91 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include -#include - -#ifdef HAS_CUPTI -#include -#endif -#include "GenericTraceActivity.h" -#include "output_base.h" - -namespace KINETO_NAMESPACE { - // Previous declaration of TraceSpan is struct. Must match the same here. - struct TraceSpan; -} - -namespace KINETO_NAMESPACE { - -class Config; - -class ChromeTraceLogger : public libkineto::ActivityLogger { - public: - explicit ChromeTraceLogger(const std::string& traceFileName); - - // Note: the caller of these functions should handle concurrency - // i.e., we these functions are not thread-safe - void handleDeviceInfo( - const DeviceInfo& info, - uint64_t time) override; - - void handleOverheadInfo(const OverheadInfo& info, int64_t time) override; - - void handleResourceInfo(const ResourceInfo& info, int64_t time) override; - - void handleTraceSpan(const TraceSpan& span) override; - - void handleActivity(const ITraceActivity& activity) override; - void handleGenericActivity(const GenericTraceActivity& activity) override; - -#ifdef HAS_CUPTI - void handleGpuActivity(const GpuActivity& activity) override; - void handleGpuActivity(const GpuActivity& activity) override; - void handleGpuActivity(const GpuActivity& activity) override; - void handleGpuActivity(const GpuActivity& activity) override; -#endif // HAS_CUPTI - - void handleTraceStart( - const std::unordered_map& metadata) override; - - void finalizeTrace( - const Config& config, - 
std::unique_ptr buffers, - int64_t endTime, - std::unordered_map>& metadata) override; - - std::string traceFileName() const { - return fileName_; - } - - private: - - // Create a flow event (arrow) - void handleLink( - char type, - const ITraceActivity& e, - int64_t id, - const std::string& cat, - const std::string& name); - - void addIterationMarker(const TraceSpan& span); - - void openTraceFile(); - - void handleGenericInstantEvent(const ITraceActivity& op); - - void handleGenericLink(const ITraceActivity& activity); - - void metadataToJSON(const std::unordered_map& metadata); - - std::string& sanitizeStrForJSON(std::string& value); - - std::string fileName_; - std::ofstream traceOf_; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/src/output_membuf.h b/plugins/tensorboard-plugins/libkineto/src/output_membuf.h deleted file mode 100644 index ef6aadeb657..00000000000 --- a/plugins/tensorboard-plugins/libkineto/src/output_membuf.h +++ /dev/null @@ -1,130 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include -#include -#include - -#ifdef HAS_CUPTI -#include -#endif - -#include "Config.h" -#include "GenericTraceActivity.h" -#ifdef HAS_CUPTI -#include "CuptiActivity.h" -#include "CuptiActivity.tpp" -#endif // HAS_CUPTI -#include "output_base.h" - -namespace KINETO_NAMESPACE { - -class Config; - -class MemoryTraceLogger : public ActivityLogger { - public: - MemoryTraceLogger(const Config& config) : config_(config.clone()) { - activities_.reserve(100000); - } - - // Note: the caller of these functions should handle concurrency - // i.e., these functions are not thread-safe - void handleDeviceInfo( - const DeviceInfo& info, - uint64_t time) override { - deviceInfoList_.emplace_back(info, time); - } - - void handleResourceInfo(const ResourceInfo& info, int64_t time) override { - resourceInfoList_.emplace_back(info, time); - } - - void handleOverheadInfo(const OverheadInfo& info, int64_t time) override {} - - void handleTraceSpan(const TraceSpan& span) override { - // Handled separately - } - - template - void addActivityWrapper(const T& act) { - wrappers_.push_back(std::make_unique(act)); - activities_.push_back(wrappers_.back().get()); - } - - // Just add the pointer to the list - ownership of the underlying - // objects must be transferred in ActivityBuffers via finalizeTrace - void handleActivity(const ITraceActivity& activity) override { - activities_.push_back(&activity); - } - void handleGenericActivity(const GenericTraceActivity& activity) override { - addActivityWrapper(activity); - } - -#ifdef HAS_CUPTI - void handleGpuActivity(const GpuActivity& activity) override { - addActivityWrapper(activity); - } - void handleGpuActivity(const GpuActivity& activity) override { - addActivityWrapper(activity); - } - void handleGpuActivity(const GpuActivity& activity) override { - addActivityWrapper(activity); - } - void handleGpuActivity(const GpuActivity& activity) override { - addActivityWrapper(activity); - } -#endif // HAS_CUPTI - - void 
handleTraceStart( - const std::unordered_map& metadata) override { - metadata_ = metadata; - } - - void finalizeTrace( - const Config& config, - std::unique_ptr buffers, - int64_t endTime, - std::unordered_map>& metadata) override { - buffers_ = std::move(buffers); - endTime_ = endTime; - } - - const std::vector* traceActivities() { - return &activities_; - } - - void log(ActivityLogger& logger) { - logger.handleTraceStart(metadata_); - for (auto& activity : activities_) { - activity->log(logger); - } - for (auto& p : deviceInfoList_) { - logger.handleDeviceInfo(p.first, p.second); - } - for (auto& p : resourceInfoList_) { - logger.handleResourceInfo(p.first, p.second); - } - for (auto& cpu_trace_buffer : buffers_->cpu) { - logger.handleTraceSpan(cpu_trace_buffer->span); - } - // Hold on to the buffers - logger.finalizeTrace(*config_, nullptr, endTime_, loggerMetadata_); - } - - private: - - std::unique_ptr config_; - // Optimization: Remove unique_ptr by keeping separate vector per type - std::vector activities_; - std::vector> wrappers_; - std::vector> deviceInfoList_; - std::vector> resourceInfoList_; - std::unique_ptr buffers_; - std::unordered_map metadata_; - std::unordered_map> loggerMetadata_; - int64_t endTime_{0}; -}; - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/test/CMakeLists.txt b/plugins/tensorboard-plugins/libkineto/test/CMakeLists.txt deleted file mode 100644 index ca54460b36c..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) - -# TODO diff --git a/plugins/tensorboard-plugins/libkineto/test/ConfigTest.cpp b/plugins/tensorboard-plugins/libkineto/test/ConfigTest.cpp deleted file mode 100644 index 16bc86e751c..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/ConfigTest.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "include/Config.h" - -#include -#include -#include -#include - -using namespace std::chrono; -using namespace KINETO_NAMESPACE; - -TEST(ParseTest, Whitespace) { - Config cfg; - // Check that various types of whitespace is ignored - EXPECT_TRUE(cfg.parse("")); - EXPECT_TRUE(cfg.parse(" ")); - EXPECT_TRUE(cfg.parse("\t")); - EXPECT_TRUE(cfg.parse("\n")); - EXPECT_TRUE(cfg.parse(" ")); - EXPECT_TRUE(cfg.parse("\t \n \t\t\n\n")); - // Only the above characters are supported - EXPECT_FALSE(cfg.parse("\r\n")); -} - -TEST(ParseTest, Comment) { - Config cfg; - // Anything following a '#' should be ignored, up to a newline - EXPECT_TRUE(cfg.parse("# comment")); - EXPECT_TRUE(cfg.parse(" # ~!@#$")); - EXPECT_TRUE(cfg.parse("\t#abc")); - EXPECT_TRUE(cfg.parse("###\n##")); - EXPECT_TRUE(cfg.parse("EVENTS=util ##ok")); - EXPECT_TRUE(cfg.parse("EVENTS=util ## EVENTS=instruction")); - // Whatever appears before the comment must be valid format - EXPECT_FALSE(cfg.parse("util ## not ok")); - EXPECT_FALSE(cfg.parse("## ok \n blah # not OK")); - // Check that a comment does not affect config parsing - EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS = 1 # Sample every millisecond")); - EXPECT_EQ(cfg.samplePeriod(), milliseconds(1)); -} - -TEST(ParseTest, Format) { - Config cfg; - // The basic format is just "name = value". - // Where both value and name can be almost anything. - // Leading and trailing whitespace should be removed - // for both 'name' and 'value', but internal whitespace is not. 
- EXPECT_FALSE(cfg.parse("events")); - EXPECT_TRUE(cfg.parse("events=")); - EXPECT_FALSE(cfg.parse("=events=")); - EXPECT_TRUE(cfg.parse("events=1,2,3")); - // Only one setting per line - EXPECT_FALSE(cfg.parse("events = 1,2,3 ; metrics = 4,5,6")); - // Names are case sensitive - EXPECT_TRUE(cfg.parse("EVENTS = 1,2,3 \n metrics = 4,5,6")); - EXPECT_EQ(cfg.eventNames(), std::set({"1", "2", "3"})); - EXPECT_EQ(cfg.metricNames().size(), 0); - // Leading and trailing whitespace removed for event and metric names, - // but not internal. - EXPECT_TRUE( - cfg.parse("EVENTS = 1, 2, 3 \n \tMETRICS\t = \t4,\t5\t,\ts i x ")); - EXPECT_EQ(cfg.eventNames(), std::set({"1", "2", "3"})); - EXPECT_EQ(cfg.metricNames(), std::set({"4", "5", "s i x"})); -} - -TEST(ParseTest, DefaultActivityTypes) { - Config cfg; - cfg.validate(std::chrono::system_clock::now()); - auto all_activities = activityTypes(); - // TODO: introduce optional activities - EXPECT_EQ(cfg.selectedActivityTypes(), - std::set(all_activities.begin(), all_activities.end() - 1)); -} - -TEST(ParseTest, ActivityTypes) { - Config cfg; - EXPECT_FALSE(cfg.parse("ACTIVITY_TYPES")); - EXPECT_TRUE(cfg.parse("ACTIVITY_TYPES=")); - EXPECT_FALSE(cfg.parse("=ACTIVITY_TYPES=")); - - EXPECT_EQ(cfg.selectedActivityTypes(), - std::set({ActivityType::CPU_OP, - ActivityType::CPU_INSTANT_EVENT, - ActivityType::PYTHON_FUNCTION, - ActivityType::USER_ANNOTATION, - ActivityType::GPU_USER_ANNOTATION, - ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CONCURRENT_KERNEL, - ActivityType::EXTERNAL_CORRELATION, - ActivityType::GLOW_RUNTIME, - ActivityType::CUDA_RUNTIME, - ActivityType::CUDA_PROFILER_RANGE})); - - Config cfg2; - EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=gpu_memcpy,gpu_MeMsEt,kernel")); - EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CONCURRENT_KERNEL})); - - EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cuda_Runtime,")); - 
EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::CUDA_RUNTIME})); - - // Should throw an exception because incorrect activity name - EXPECT_FALSE(cfg2.parse("ACTIVITY_TYPES = memcopy,cuda_runtime")); - - EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cpu_op")); - EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::CPU_OP})); -} - -TEST(ParseTest, SamplePeriod) { - Config cfg; - EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS=10")); - EXPECT_EQ(cfg.samplePeriod(), milliseconds(10)); - EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS=0")); - cfg.validate(std::chrono::system_clock::now()); - // 0 should be adjustd up to 1 - EXPECT_EQ(cfg.samplePeriod(), milliseconds(1)); - // Negative and non-int values should fail - EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS=-10")); - EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS=1.5")); - EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS=")); - EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS=string")); - EXPECT_EQ(cfg.samplePeriod(), milliseconds(1)); -} - -TEST(ParseTest, MultiplexPeriod) { - Config cfg; - auto now = std::chrono::system_clock::now(); - - EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS=100\nMULTIPLEX_PERIOD_MSECS=100")); - EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(100)); - EXPECT_TRUE(cfg.parse("MULTIPLEX_PERIOD_MSECS = 0")); - cfg.validate(now); - // Adjusted to match sample period - EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(100)); - EXPECT_TRUE(cfg.parse("MULTIPLEX_PERIOD_MSECS \t= \t 750 \n")); - cfg.validate(now); - // Adjusted to match multiple of sample period - EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(800)); - EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS=-10")); - EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS=1.5")); - EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS=")); - EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS=string")); - // Previous value not affected - EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(800)); -} - -TEST(ParseTest, ReportPeriod) { - Config cfg; - 
EXPECT_TRUE(cfg.parse("REPORT_PERIOD_SECS=1")); - EXPECT_EQ(cfg.reportPeriod(), seconds(1)); - // Whitespace - EXPECT_TRUE(cfg.parse("REPORT_PERIOD_SECS = \t100")); - EXPECT_EQ(cfg.reportPeriod(), seconds(100)); - // Invalid types - EXPECT_FALSE(cfg.parse("REPORT_PERIOD_SECS=-1")); - EXPECT_EQ(cfg.reportPeriod(), seconds(100)); -} - -TEST(ParseTest, SamplesPerReport) { - Config cfg; - auto now = std::chrono::system_clock::now(); - - EXPECT_TRUE(cfg.parse(R"( - SAMPLE_PERIOD_MSECS = 1000 - REPORT_PERIOD_SECS = 1 - SAMPLES_PER_REPORT = 10)")); - cfg.validate(now); - // Adjusted down to one sample per report - EXPECT_EQ(cfg.samplesPerReport(), 1); - EXPECT_TRUE(cfg.parse(R"( - SAMPLE_PERIOD_MSECS = 1000 - REPORT_PERIOD_SECS = 10 - SAMPLES_PER_REPORT = 10)")); - cfg.validate(now); - // No adjustment needed - EXPECT_EQ(cfg.samplesPerReport(), 10); - EXPECT_TRUE(cfg.parse(R"( - SAMPLE_PERIOD_MSECS = 1000 - REPORT_PERIOD_SECS = 2 - SAMPLES_PER_REPORT = 10)")); - cfg.validate(now); - // Adjusted to 2 samples per report - EXPECT_EQ(cfg.samplesPerReport(), 2); - EXPECT_TRUE(cfg.parse(R"( - SAMPLE_PERIOD_MSECS = 200 - REPORT_PERIOD_SECS = 2 - SAMPLES_PER_REPORT = 10)")); - cfg.validate(now); - // No adjustment needed - EXPECT_EQ(cfg.samplesPerReport(), 10); - EXPECT_TRUE(cfg.parse("SAMPLES_PER_REPORT=0")); - cfg.validate(now); - // Adjusted up to 1 - EXPECT_EQ(cfg.samplesPerReport(), 1); - // Invalid value types - EXPECT_FALSE(cfg.parse("SAMPLES_PER_REPORT=-10")); - EXPECT_FALSE(cfg.parse("SAMPLES_PER_REPORT=1.5")); - EXPECT_EQ(cfg.samplesPerReport(), 1); - - EXPECT_TRUE(cfg.parse(R"( - SAMPLE_PERIOD_MSECS=1000 - MULTIPLEX_PERIOD_MSECS=500 # Must be a multiple of sample period - REPORT_PERIOD_SECS=0 # Must be non-zero multiple of multiplex period - SAMPLES_PER_REPORT=5 # Max report period / multiplex period)")); - cfg.validate(now); - // Multiple adjustments - EXPECT_EQ(cfg.samplePeriod(), milliseconds(1000)); - EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(1000)); - 
EXPECT_EQ(cfg.reportPeriod(), seconds(1)); - EXPECT_EQ(cfg.samplesPerReport(), 1); -} - -TEST(ParseTest, EnableSigUsr2) { - Config cfg; - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=yes")); - EXPECT_TRUE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=no")); - EXPECT_FALSE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=YES")); - EXPECT_TRUE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=NO")); - EXPECT_FALSE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=Y")); - EXPECT_TRUE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=N")); - EXPECT_FALSE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=T")); - EXPECT_TRUE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=F")); - EXPECT_FALSE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=true")); - EXPECT_TRUE(cfg.sigUsr2Enabled()); - EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=false")); - EXPECT_FALSE(cfg.sigUsr2Enabled()); - EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2= ")); - EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2=2")); - EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2=-1")); - EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2=yep")); -} - -TEST(ParseTest, DeviceMask) { - Config cfg; - // Single device - EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 0")); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(0)); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(1)); - - // Two devices, internal whitespace - EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 1, 2")); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(0)); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(1)); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(2)); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(3)); - - // Three devices, check that previous devices are ignored - EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 0, 2,4")); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(0)); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(1)); - 
EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(2)); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(3)); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(4)); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(5)); - - // Repeated numbers have no effect - EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 0,1,1,1,2,3,2,1,3,7,7,3")); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(0)); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(1)); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(2)); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(3)); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(4)); - EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(6)); - EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(7)); - - // 8 is larger than the max allowed - EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 3,8")); - - // 300 cannot be held in an uint8_t - EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 300")); - - // Various illegal cases - EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 0,1,two,three")); - EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 0,1,,2")); - EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = -1")); - EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 1.0")); -} - -TEST(ParseTest, RequestTime) { - Config cfg; - system_clock::time_point now = system_clock::now(); - int64_t tgood_ms = - duration_cast(now.time_since_epoch()).count(); - EXPECT_TRUE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", tgood_ms))); - - tgood_ms = duration_cast((now - seconds(5)).time_since_epoch()) - .count(); - EXPECT_TRUE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", tgood_ms))); - - int64_t tbad_ms = - duration_cast((now - seconds(20)).time_since_epoch()) - .count(); - EXPECT_FALSE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", tbad_ms))); - - EXPECT_FALSE(cfg.parse("REQUEST_TIMESTAMP = 0")); - EXPECT_FALSE(cfg.parse("REQUEST_TIMESTAMP = -1")); - - tbad_ms = duration_cast((now + seconds(10)).time_since_epoch()) - .count(); - EXPECT_FALSE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", 
tbad_ms))); -} diff --git a/plugins/tensorboard-plugins/libkineto/test/CuptiActivityProfilerTest.cpp b/plugins/tensorboard-plugins/libkineto/test/CuptiActivityProfilerTest.cpp deleted file mode 100644 index 6e67980ee31..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CuptiActivityProfilerTest.cpp +++ /dev/null @@ -1,629 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include -#include -#include -#include -#include - -#ifdef __linux__ -#include -#include -#include -#endif - -#include "include/libkineto.h" -#include "include/Config.h" -#include "src/CuptiActivityProfiler.h" -#include "src/ActivityTrace.h" -#include "src/CuptiActivityApi.h" -#include "src/output_base.h" -#include "src/output_json.h" -#include "src/output_membuf.h" - -#include "src/Logger.h" -#include "test/MockActivitySubProfiler.h" - -using namespace std::chrono; -using namespace KINETO_NAMESPACE; - -#define CUDA_LAUNCH_KERNEL CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 -#define CUDA_MEMCPY CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 - -namespace { -const TraceSpan& defaultTraceSpan() { - static TraceSpan span(0, 0, "Unknown", ""); - return span; -} -} - -// Provides ability to easily create a few test CPU-side ops -struct MockCpuActivityBuffer : public CpuTraceBuffer { - MockCpuActivityBuffer(int64_t startTime, int64_t endTime) { - span = TraceSpan(startTime, endTime,"Test trace"); - gpuOpCount = 0; - } - - void addOp(std::string name, int64_t startTime, int64_t endTime, int64_t correlation) { - GenericTraceActivity op(span, ActivityType::CPU_OP, name); - op.startTime = startTime; - op.endTime = endTime; - op.resource = systemThreadId(); - op.id = correlation; - activities.push_back(std::move(op)); - span.opCount++; - } -}; - -// Provides ability to easily create a few test CUPTI ops -struct MockCuptiActivityBuffer { - void addCorrelationActivity(int64_t correlation, CUpti_ExternalCorrelationKind externalKind, int64_t externalId) { - 
auto& act = *(CUpti_ActivityExternalCorrelation*) malloc(sizeof(CUpti_ActivityExternalCorrelation)); - act.kind = CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION; - act.externalId = externalId; - act.externalKind = externalKind; - act.correlationId = correlation; - activities.push_back(reinterpret_cast(&act)); - } - - void addRuntimeActivity( - CUpti_runtime_api_trace_cbid_enum cbid, - int64_t start_us, int64_t end_us, int64_t correlation) { - auto& act = createActivity( - start_us, end_us, correlation); - act.kind = CUPTI_ACTIVITY_KIND_RUNTIME; - act.cbid = cbid; - act.threadId = threadId(); - activities.push_back(reinterpret_cast(&act)); - } - - void addKernelActivity( - int64_t start_us, int64_t end_us, int64_t correlation) { - auto& act = createActivity( - start_us, end_us, correlation); - act.kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL; - act.deviceId = 0; - act.streamId = 1; - act.name = "kernel"; - act.gridX = act.gridY = act.gridZ = 1; - act.blockX = act.blockY = act.blockZ = 1; - activities.push_back(reinterpret_cast(&act)); - } - - void addMemcpyActivity( - int64_t start_us, int64_t end_us, int64_t correlation) { - auto& act = createActivity( - start_us, end_us, correlation); - act.kind = CUPTI_ACTIVITY_KIND_MEMCPY; - act.deviceId = 0; - act.streamId = 2; - act.copyKind = CUPTI_ACTIVITY_MEMCPY_KIND_HTOD; - act.srcKind = CUPTI_ACTIVITY_MEMORY_KIND_PINNED; - act.dstKind = CUPTI_ACTIVITY_MEMORY_KIND_DEVICE; - activities.push_back(reinterpret_cast(&act)); - } - - template - T& createActivity( - int64_t start_us, int64_t end_us, int64_t correlation) { - T& act = *static_cast(malloc(sizeof(T))); - bzero(&act, sizeof(act)); - act.start = start_us * 1000; - act.end = end_us * 1000; - act.correlationId = correlation; - return act; - } - - ~MockCuptiActivityBuffer() { - for (CUpti_Activity* act : activities) { - free(act); - } - } - - std::vector activities; -}; - -// Mock parts of the CuptiActivityApi -class MockCuptiActivities : public CuptiActivityApi { - public: - 
virtual int smCount() override { - return 10; - } - - virtual const std::pair processActivities( - CuptiActivityBufferMap&, /*unused*/ - std::function handler) override { - for (CUpti_Activity* act : activityBuffer->activities) { - handler(act); - } - return {activityBuffer->activities.size(), 100}; - } - - virtual std::unique_ptr - activityBuffers() override { - auto map = std::make_unique(); - auto buf = std::make_unique(100); - uint8_t* addr = buf->data(); - (*map)[addr] = std::move(buf); - return map; - } - - void bufferRequestedOverride(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { - this->bufferRequested(buffer, size, maxNumRecords); - } - - std::unique_ptr activityBuffer; -}; - - -// Common setup / teardown and helper functions -class CuptiActivityProfilerTest : public ::testing::Test { - protected: - void SetUp() override { - profiler_ = std::make_unique( - cuptiActivities_, /*cpu only*/ false); - cfg_ = std::make_unique(); - cfg_->validate(std::chrono::system_clock::now()); - loggerFactory.addProtocol("file", [](const std::string& url) { - return std::unique_ptr(new ChromeTraceLogger(url)); - }); - } - - std::unique_ptr cfg_; - MockCuptiActivities cuptiActivities_; - std::unique_ptr profiler_; - ActivityLoggerFactory loggerFactory; -}; - -void checkTracefile(const char* filename) { -#ifdef __linux__ - // Check that the expected file was written and that it has some content - int fd = open(filename, O_RDONLY); - if (!fd) { - perror(filename); - } - EXPECT_TRUE(fd); - // Should expect at least 100 bytes - struct stat buf{}; - fstat(fd, &buf); - EXPECT_GT(buf.st_size, 100); - close(fd); -#endif -} - -TEST(CuptiActivityProfiler, AsyncTrace) { - std::vector log_modules( - {"CuptiActivityProfiler.cpp", "output_json.cpp"}); - SET_LOG_VERBOSITY_LEVEL(1, log_modules); - - MockCuptiActivities activities; - CuptiActivityProfiler profiler(activities, /*cpu only*/ true); - - char filename[] = "/tmp/libkineto_testXXXXXX.json"; - mkstemps(filename, 5); - - 
Config cfg; - - int iter = 0; - int warmup = 5; - auto now = system_clock::now(); - auto startTime = now + seconds(10); - - bool success = cfg.parse(fmt::format(R"CFG( - ACTIVITIES_WARMUP_PERIOD_SECS = {} - ACTIVITIES_DURATION_SECS = 1 - ACTIVITIES_LOG_FILE = {} - PROFILE_START_TIME = {} - )CFG", warmup, filename, duration_cast(startTime.time_since_epoch()).count())); - - EXPECT_TRUE(success); - EXPECT_FALSE(profiler.isActive()); - - auto logger = std::make_unique(cfg.activitiesLogFile()); - - // Usually configuration is done when now is startTime - warmup to kick off warmup - // but start right away in the test - profiler.configure(cfg, now); - profiler.setLogger(logger.get()); - - EXPECT_TRUE(profiler.isActive()); - - // fast forward in time and we have reached the startTime - now = startTime; - - // Run the profiler - // Warmup - // performRunLoopStep is usually called by the controller loop and takes - // the current time and the controller's next wakeup time. - profiler.performRunLoopStep( - /* Current time */ now, /* Next wakeup time */ now); - - auto next = now + milliseconds(1000); - - // performRunLoopStep can also be called by an application thread to update iteration count - // since this config does not use iteration this should have no effect on the state - while (++iter < 20) { - profiler.performRunLoopStep(now, now, iter); - } - - // Runloop should now be in collect state, so start workload - // Perform another runloop step, passing in the end profile time as current. - // This should terminate collection - profiler.performRunLoopStep( - /* Current time */ next, /* Next wakeup time */ next); - // One step needed for each of the Process and Finalize phases - // Doesn't really matter what times we pass in here. 
- - EXPECT_TRUE(profiler.isActive()); - - auto nextnext = next + milliseconds(1000); - - while (++iter < 40) { - profiler.performRunLoopStep(next, next, iter); - } - - EXPECT_TRUE(profiler.isActive()); - - profiler.performRunLoopStep(nextnext,nextnext); - profiler.performRunLoopStep(nextnext,nextnext); - - // Assert that tracing has completed - EXPECT_FALSE(profiler.isActive()); - - checkTracefile(filename); -} - -TEST(CuptiActivityProfiler, AsyncTraceUsingIter) { - std::vector log_modules( - {"CuptiActivityProfiler.cpp", "output_json.cpp"}); - SET_LOG_VERBOSITY_LEVEL(1, log_modules); - - auto runIterTest = [&]( - int start_iter, int warmup_iters, int trace_iters) { - - LOG(INFO ) << "Async Trace Test: start_iteration = " << start_iter - << " warmup iterations = " << warmup_iters - << " trace iterations = " << trace_iters; - - MockCuptiActivities activities; - CuptiActivityProfiler profiler(activities, /*cpu only*/ true); - - char filename[] = "/tmp/libkineto_testXXXXXX.json"; - mkstemps(filename, 5); - - Config cfg; - - int iter = 0; - auto now = system_clock::now(); - - bool success = cfg.parse(fmt::format(R"CFG( - PROFILE_START_ITERATION = {} - ACTIVITIES_WARMUP_ITERATIONS={} - ACTIVITIES_ITERATIONS={} - ACTIVITIES_DURATION_SECS = 1 - ACTIVITIES_LOG_FILE = {} - )CFG", start_iter, warmup_iters, trace_iters, filename)); - - EXPECT_TRUE(success); - EXPECT_FALSE(profiler.isActive()); - - auto logger = std::make_unique(cfg.activitiesLogFile()); - - // Usually configuration is done when now is startIter - warmup iter to kick off warmup - // but start right away in the test - while (iter < (start_iter - warmup_iters)) { - profiler.performRunLoopStep(now, now, iter++); - } - - profiler.configure(cfg, now); - profiler.setLogger(logger.get()); - - EXPECT_TRUE(profiler.isActive()); - - // fast forward in time, mimicking what will happen in reality - now += seconds(10); - auto next = now + milliseconds(1000); - - // this call to runloop step should not be effecting the 
state - profiler.performRunLoopStep(now, next); - EXPECT_TRUE(profiler.isActive()); - - // start trace collection - while (iter < start_iter) { - profiler.performRunLoopStep(now, next, iter++); - } - - // Runloop should now be in collect state, so start workload - - while (iter < (start_iter + trace_iters)) { - profiler.performRunLoopStep(now, next, iter++); - } - - // One step is required for each of the Process and Finalize phases - // Doesn't really matter what times we pass in here. - if (iter >= (start_iter + trace_iters)) { - profiler.performRunLoopStep(now, next, iter++); - } - EXPECT_TRUE(profiler.isActive()); - - auto nextnext = next + milliseconds(1000); - - profiler.performRunLoopStep(nextnext, nextnext); - profiler.performRunLoopStep(nextnext, nextnext); - - // Assert that tracing has completed - EXPECT_FALSE(profiler.isActive()); - - checkTracefile(filename); - }; - - // start iter = 50, warmup iters = 5, trace iters = 10 - runIterTest(50, 5, 10); - // should be able to start at 0 iteration - runIterTest(0, 0, 2); - runIterTest(0, 5, 5); -} - -TEST_F(CuptiActivityProfilerTest, SyncTrace) { - using ::testing::Return; - using ::testing::ByMove; - - // Verbose logging is useful for debugging - std::vector log_modules( - {"CuptiActivityProfiler.cpp"}); - SET_LOG_VERBOSITY_LEVEL(2, log_modules); - - // Start and stop profiling - CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); - int64_t start_time_us = 100; - int64_t duration_us = 300; - auto start_time = time_point(microseconds(start_time_us)); - profiler.configure(*cfg_, start_time); - profiler.startTrace(start_time); - profiler.stopTrace(start_time + microseconds(duration_us)); - - profiler.recordThreadInfo(); - - // Log some cpu ops - auto cpuOps = std::make_unique( - start_time_us, start_time_us + duration_us); - cpuOps->addOp("op1", 120, 150, 1); - cpuOps->addOp("op2", 130, 140, 2); - cpuOps->addOp("op3", 200, 250, 3); - profiler.transferCpuTrace(std::move(cpuOps)); - - // And 
some GPU ops - auto gpuOps = std::make_unique(); - gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, 133, 138, 1); - gpuOps->addRuntimeActivity(CUDA_MEMCPY, 210, 220, 2); - gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, 230, 245, 3); - gpuOps->addKernelActivity(150, 170, 1); - gpuOps->addMemcpyActivity(240, 250, 2); - gpuOps->addKernelActivity(260, 320, 3); - cuptiActivities_.activityBuffer = std::move(gpuOps); - - // Have the profiler process them - auto logger = std::make_unique(*cfg_); - profiler.processTrace(*logger); - - // Profiler can be reset at this point - logger owns the activities - profiler_->reset(); - - // Wrapper that allows iterating over the activities - ActivityTrace trace(std::move(logger), loggerFactory); - EXPECT_EQ(trace.activities()->size(), 9); - std::map activityCounts; - std::map resourceIds; - for (auto& activity : *trace.activities()) { - activityCounts[activity->name()]++; - resourceIds[activity->resourceId()]++; - } - for (const auto& p : activityCounts) { - LOG(INFO) << p.first << ": " << p.second; - } - EXPECT_EQ(activityCounts["op1"], 1); - EXPECT_EQ(activityCounts["op2"], 1); - EXPECT_EQ(activityCounts["op3"], 1); - EXPECT_EQ(activityCounts["cudaLaunchKernel"], 2); - EXPECT_EQ(activityCounts["cudaMemcpy"], 1); - EXPECT_EQ(activityCounts["kernel"], 2); - EXPECT_EQ(activityCounts["Memcpy HtoD (Pinned -> Device)"], 1); - - auto sysTid = systemThreadId(); - // Ops and runtime events are on thread sysTid - EXPECT_EQ(resourceIds[sysTid], 6); - // Kernels are on stream 1, memcpy on stream 2 - EXPECT_EQ(resourceIds[1], 2); - EXPECT_EQ(resourceIds[2], 1); - -#ifdef __linux__ - char filename[] = "/tmp/libkineto_testXXXXXX.json"; - mkstemps(filename, 5); - trace.save(filename); - // Check that the expected file was written and that it has some content - int fd = open(filename, O_RDONLY); - if (!fd) { - perror(filename); - } - EXPECT_TRUE(fd); - // Should expect at least 100 bytes - struct stat buf{}; - fstat(fd, &buf); - EXPECT_GT(buf.st_size, 
100); -#endif -} - -TEST_F(CuptiActivityProfilerTest, GpuUserAnnotationTest) { - // Verbose logging is useful for debugging - std::vector log_modules( - {"CuptiActivityProfiler.cpp"}); - SET_LOG_VERBOSITY_LEVEL(2, log_modules); - - // Start and stop profiling - CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); - int64_t start_time_us = 100; - int64_t duration_us = 300; - auto start_time = time_point(microseconds(start_time_us)); - profiler.configure(*cfg_, start_time); - profiler.startTrace(start_time); - profiler.stopTrace(start_time + microseconds(duration_us)); - - int64_t kernelLaunchTime = 120; - profiler.recordThreadInfo(); - - // set up CPU event - auto cpuOps = std::make_unique( - start_time_us, start_time_us + duration_us); - cpuOps->addOp("annotation", kernelLaunchTime, kernelLaunchTime + 10, 1); - profiler.transferCpuTrace(std::move(cpuOps)); - - // set up a couple of GPU events and correlate with above CPU event. - // CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1 is used for user annotations. - auto gpuOps = std::make_unique(); - gpuOps->addCorrelationActivity(1, CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, 1); - gpuOps->addKernelActivity(kernelLaunchTime + 5, kernelLaunchTime + 10, 1); - gpuOps->addCorrelationActivity(1, CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, 1); - gpuOps->addKernelActivity(kernelLaunchTime + 15, kernelLaunchTime + 25, 1); - cuptiActivities_.activityBuffer = std::move(gpuOps); - - // process trace - auto logger = std::make_unique(*cfg_); - profiler.processTrace(*logger); - - ActivityTrace trace(std::move(logger), loggerFactory); - std::map counts; - for (auto& activity : *trace.activities()) { - counts[activity->name()]++; - } - - // We should now have an additional annotation activity created - // on the GPU timeline. 
- EXPECT_EQ(counts["annotation"], 2); - EXPECT_EQ(counts["kernel"], 2); - - auto& annotation = trace.activities()->at(0); - auto& kernel1 = trace.activities()->at(1); - auto& kernel2 = trace.activities()->at(2); - auto& gpu_annotation = trace.activities()->at(3); - EXPECT_EQ(gpu_annotation->type(), ActivityType::GPU_USER_ANNOTATION); - EXPECT_EQ(gpu_annotation->timestamp(), kernel1->timestamp()); - EXPECT_EQ( - gpu_annotation->duration(), - kernel2->timestamp() + kernel2->duration() - kernel1->timestamp()); - EXPECT_EQ(gpu_annotation->deviceId(), kernel1->deviceId()); - EXPECT_EQ(gpu_annotation->resourceId(), kernel1->resourceId()); - EXPECT_EQ(gpu_annotation->correlationId(), annotation->correlationId()); - EXPECT_EQ(gpu_annotation->name(), annotation->name()); -} - -TEST_F(CuptiActivityProfilerTest, SubActivityProfilers) { - using ::testing::Return; - using ::testing::ByMove; - - // Verbose logging is useful for debugging - std::vector log_modules( - {"CuptiActivityProfiler.cpp"}); - SET_LOG_VERBOSITY_LEVEL(2, log_modules); - - // Setup example events to test - GenericTraceActivity ev{defaultTraceSpan(), ActivityType::GLOW_RUNTIME, ""}; - ev.device = 1; - ev.resource = 0; - - int64_t start_time_us = 100; - int64_t duration_us = 1000; - auto start_time = time_point(microseconds(start_time_us)); - - std::vector test_activities{3, ev}; - test_activities[0].startTime = start_time_us; - test_activities[0].endTime = start_time_us + 5000; - test_activities[0].activityName = "SubGraph A execution"; - test_activities[1].startTime = start_time_us; - test_activities[1].endTime = start_time_us + 2000; - test_activities[1].activityName = "Operator foo"; - test_activities[2].startTime = start_time_us + 2500; - test_activities[2].endTime = start_time_us + 2900; - test_activities[2].activityName = "Operator bar"; - - auto mock_activity_profiler = - std::make_unique(test_activities); - - MockCuptiActivities activities; - CuptiActivityProfiler profiler(activities, /*cpu only*/ 
true); - profiler.addChildActivityProfiler( - std::move(mock_activity_profiler)); - - profiler.configure(*cfg_, start_time); - profiler.startTrace(start_time); - EXPECT_TRUE(profiler.isActive()); - - profiler.stopTrace(start_time + microseconds(duration_us)); - EXPECT_TRUE(profiler.isActive()); - - char filename[] = "/tmp/libkineto_testXXXXXX.json"; - mkstemps(filename, 5); - LOG(INFO) << "Logging to tmp file " << filename; - - // process trace - auto logger = std::make_unique(*cfg_); - profiler.processTrace(*logger); - profiler.setLogger(logger.get()); - - ActivityTrace trace(std::move(logger), loggerFactory); - trace.save(filename); - const auto& traced_activites = trace.activities(); - - // Test we have all the events - EXPECT_EQ(traced_activites->size(), test_activities.size()); - - // Check that the expected file was written and that it has some content - int fd = open(filename, O_RDONLY); - if (!fd) { - perror(filename); - } - EXPECT_TRUE(fd); - - // Should expect at least 100 bytes - struct stat buf{}; - fstat(fd, &buf); - EXPECT_GT(buf.st_size, 100); -} - -TEST_F(CuptiActivityProfilerTest, BufferSizeLimitTestWarmup) { - CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); - - auto now = system_clock::now(); - auto startTime = now + seconds(10); - - int maxBufferSizeMB = 3; - - auto startTimeEpoch = std::to_string(duration_cast(startTime.time_since_epoch()).count()); - std::string maxBufferSizeMBStr = std::to_string(maxBufferSizeMB); - cfg_->handleOption("ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB", maxBufferSizeMBStr); - cfg_->handleOption("PROFILE_START_TIME", startTimeEpoch); - - - EXPECT_FALSE(profiler.isActive()); - profiler.configure(*cfg_, now); - EXPECT_TRUE(profiler.isActive()); - - for (size_t i = 0; i < maxBufferSizeMB; i++) { - uint8_t* buf; - size_t gpuBufferSize; - size_t maxNumRecords; - cuptiActivities_.bufferRequestedOverride(&buf, &gpuBufferSize, &maxNumRecords); - } - - // fast forward to startTime and profiler is now running - now 
= startTime; - - profiler.performRunLoopStep(now, now); - - auto next = now + milliseconds(1000); - profiler.performRunLoopStep(next, next); - profiler.performRunLoopStep(next, next); - profiler.performRunLoopStep(next, next); - - EXPECT_FALSE(profiler.isActive()); -} diff --git a/plugins/tensorboard-plugins/libkineto/test/CuptiCallbackApiTest.cpp b/plugins/tensorboard-plugins/libkineto/test/CuptiCallbackApiTest.cpp deleted file mode 100644 index 253b696da54..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CuptiCallbackApiTest.cpp +++ /dev/null @@ -1,239 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "src/Logger.h" -#include "src/CuptiCallbackApi.h" - -#include -#include -#include -#include - -using namespace std::chrono; -using namespace KINETO_NAMESPACE; -using namespace libkineto; - -const size_t some_data = 42; - -std::atomic simple_cb_calls = 0; - -void simple_cb( - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo) { - - // simple arg check - EXPECT_EQ(domain, CUPTI_CB_DOMAIN_RUNTIME_API); - EXPECT_EQ(cbid, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); - EXPECT_EQ(*reinterpret_cast(cbInfo), some_data); - - simple_cb_calls++; -} - -void atomic_cb( - CUpti_CallbackDomain /*domain*/, - CUpti_CallbackId /*cbid*/, - const CUpti_CallbackData* /*cbInfo)*/) { - // do some atomics in a loop - for (int i = 0; i < 1000; i++) { - // would have used release consistency but this is fine - simple_cb_calls++; - } -} - -void empty_cb( - CUpti_CallbackDomain /*domain*/, - CUpti_CallbackId /*cbid*/, - const CUpti_CallbackData* /*cbInfo*/) { -} - -TEST(CuptiCallbackApiTest, SimpleTest) { - auto& api = CuptiCallbackApi::singleton(); - - auto addSimpleCallback = [&]() -> bool { - bool ret = api.registerCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CuptiCallbackApi::CUDA_LAUNCH_KERNEL, - &simple_cb - ); - return ret; - }; - EXPECT_TRUE(addSimpleCallback()) << "Failed to add 
callback"; - - // duplicate add should be okay - EXPECT_TRUE(addSimpleCallback()) << "Failed to re-add callback"; - - simple_cb_calls = 0; - - // simulate callback - api.__callback_switchboard( - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, - reinterpret_cast(&some_data)); - - EXPECT_EQ(simple_cb_calls, 1); - - bool ret = api.deleteCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CuptiCallbackApi::CUDA_LAUNCH_KERNEL, - &simple_cb - ); - - EXPECT_TRUE(ret) << "Failed to remove callback"; - - ret = api.deleteCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CuptiCallbackApi::CUDA_LAUNCH_KERNEL, - &atomic_cb - ); - - EXPECT_FALSE(ret) << "oops! deleted a callback that was never added"; -} - -TEST(CuptiCallbackApiTest, AllCallbacks) { - auto& api = CuptiCallbackApi::singleton(); - - auto testCallback = [&]( - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - CuptiCallbackApi::CuptiCallBackID kineto_cbid) -> bool { - - bool ret = api.registerCallback(domain, kineto_cbid, atomic_cb); - EXPECT_TRUE(ret) << "Failed to add callback"; - - if (!ret) { - return false; - } - - simple_cb_calls = 0; - api.__callback_switchboard(domain, cbid, nullptr); - EXPECT_EQ(simple_cb_calls, 1000); - ret = simple_cb_calls == 1000; - - EXPECT_TRUE(api.deleteCallback(domain, kineto_cbid, atomic_cb)); - - return ret; - }; - - EXPECT_TRUE( - testCallback( - CUPTI_CB_DOMAIN_RESOURCE, - CUPTI_CBID_RESOURCE_CONTEXT_CREATED, - CuptiCallbackApi::RESOURCE_CONTEXT_CREATED)) - << "Failed to run callback for RESOURCE_CONTEXT_CREATED"; - - EXPECT_TRUE( - testCallback( - CUPTI_CB_DOMAIN_RESOURCE, - CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, - CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED)) - << "Failed to run callback for RESOURCE_CONTEXT_DESTROYED"; - - EXPECT_TRUE( - testCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, - CuptiCallbackApi::CUDA_LAUNCH_KERNEL)) - << "Failed to run callback for CUDA_LAUNCH_KERNEL"; - -} - 
-TEST(CuptiCallbackApiTest, ContentionTest) { - auto& api = CuptiCallbackApi::singleton(); - const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RUNTIME_API; - const CUpti_CallbackId cbid = CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000; - const CuptiCallbackApi::CuptiCallBackID kineto_cbid = - CuptiCallbackApi::CUDA_LAUNCH_KERNEL; - - bool ret = api.registerCallback(domain, kineto_cbid, empty_cb); - EXPECT_TRUE(ret) << "Failed to add callback"; - - const int iters = 10000; - const int num_readers = 8; - - simple_cb_calls = 0; - - // simulate callbacks being executed on multiple threads in parallel - // during this interval add a new atomic_callback. - // this test ensured mutual exclusion is working fine - auto read_fn = [&](int tid){ - auto start_ts = high_resolution_clock::now(); - for (int i = 0; i < iters; i++) { - api.__callback_switchboard(domain, cbid, nullptr); - } - auto runtime_ms = duration_cast( - high_resolution_clock::now() - start_ts); - LOG(INFO) << "th " << tid << " done in " << runtime_ms.count() << " ms"; - }; - - - std::vector read_ths; - for (int i = 0; i< num_readers; i++) { - read_ths.emplace_back(read_fn, i); - } - - ret = api.registerCallback(domain, kineto_cbid, atomic_cb); - EXPECT_TRUE(ret) << "Failed to add callback"; - - for (auto& t : read_ths) { - t.join(); - } - - //EXPECT_GT(simple_cb_calls, 0) - // << "Atomic callback should have been called at least once."; - - api.deleteCallback(domain, kineto_cbid, empty_cb); - api.deleteCallback(domain, kineto_cbid, atomic_cb); -} - -TEST(CuptiCallbackApiTest, Bechmark) { - - constexpr int iters = 1000; - // atomic bench a number of times to get a baseline - - const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RUNTIME_API; - const CUpti_CallbackId cbid = CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000; - const CuptiCallbackApi::CuptiCallBackID kineto_cbid = - CuptiCallbackApi::CUDA_LAUNCH_KERNEL; - - LOG(INFO) << "Iteration count = " << iters; - - const bool use_empty = true; - auto cbfn = 
use_empty ? &empty_cb : &atomic_cb; - - // warmup - for (int i = 0; i < 50; i++) { - (*cbfn)(domain, cbid, nullptr); - } - - auto start_ts = high_resolution_clock::now(); - for (int i = 0; i < iters; i++) { - (*cbfn)(domain, cbid, nullptr); - } - auto delta_baseline_ns = duration_cast( - high_resolution_clock::now() - start_ts); - LOG(INFO) << "Baseline runtime = " << delta_baseline_ns.count() << " ns"; - - - auto& api = CuptiCallbackApi::singleton(); - bool ret = api.registerCallback(domain, kineto_cbid, cbfn); - EXPECT_TRUE(ret) << "Failed to add callback"; - - // warmup - for (int i = 0; i < 50; i++) { - api.__callback_switchboard(domain, cbid, nullptr); - } - - start_ts = high_resolution_clock::now(); - for (int i = 0; i < iters; i++) { - api.__callback_switchboard(domain, cbid, nullptr); - } - - auto delta_callback_ns = duration_cast( - high_resolution_clock::now() - start_ts); - LOG(INFO) << "Callback runtime = " << delta_callback_ns.count() << " ns"; - - LOG(INFO) << "Callback runtime per iteration = " << - (delta_callback_ns.count() - delta_baseline_ns.count()) / (double) iters - << " ns"; - -} diff --git a/plugins/tensorboard-plugins/libkineto/test/CuptiProfilerApiTest.cu b/plugins/tensorboard-plugins/libkineto/test/CuptiProfilerApiTest.cu deleted file mode 100644 index 54ad51b0a1f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CuptiProfilerApiTest.cu +++ /dev/null @@ -1,353 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include -#include -#include - -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "src/Logger.h" -#include "src/CuptiRangeProfilerApi.h" - -#define DRIVER_API_CALL(apiFuncCall) \ - do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - LOG(ERROR) << "Failed invoking CUDA driver function " \ - << #apiFuncCall << " status = " \ - << _status; \ - exit(-1); \ - } \ - } while (0) - -#define EXPECT(expr)\ - if (!(expr)) {\ - }; - -using namespace KINETO_NAMESPACE; - -static int numRanges = 1; - -using Type = double; - -// Device code -__global__ void VecAdd(const Type* A, const Type* B, Type* C, int N) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { - C[i] = A[i] + B[i]; - } -} - -// Device code -__global__ void VecSub(const Type* A, const Type* B, Type* C, int N) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { - C[i] = A[i] - B[i]; - } -} - -static void initVec(Type* vec, int n) { - for (int i = 0; i < n; i++) { - vec[i] = i; - } -} - -static void cleanUp( - Type* h_A, - Type* h_B, - Type* h_C, - Type* h_D, - Type* d_A, - Type* d_B, - Type* d_C, - Type* d_D) { - if (d_A) - cudaFree(d_A); - if (d_B) - cudaFree(d_B); - if (d_C) - cudaFree(d_C); - if (d_D) - cudaFree(d_D); - - // Free host memory - if (h_A) - free(h_A); - if (h_B) - free(h_B); - if (h_C) - free(h_C); - if (h_D) - free(h_D); -} - -/* Benchmark application used to test profiler measurements - * This simply runs two kernels vector Add and Vector Subtract - */ - -void VectorAddSubtract() { - int N = 50000; - size_t size = N * sizeof(Type); - int threadsPerBlock = 0; - int blocksPerGrid = 0; - Type *h_A, *h_B, *h_C, *h_D; - Type *d_A, *d_B, *d_C, *d_D; - int i; - Type sum, diff; - - // Allocate input vectors h_A and h_B in host memory - h_A = (Type*)malloc(size); - h_B = (Type*)malloc(size); - h_C = (Type*)malloc(size); - h_D = (Type*)malloc(size); - - // Initialize input vectors - initVec(h_A, 
N); - initVec(h_B, N); - memset(h_C, 0, size); - memset(h_D, 0, size); - - // Allocate vectors in device memory - cudaMalloc((void**)&d_A, size); - cudaMalloc((void**)&d_B, size); - cudaMalloc((void**)&d_C, size); - cudaMalloc((void**)&d_D, size); - - // Copy vectors from host memory to device memory - cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); - - // Invoke kernel - threadsPerBlock = 256; - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; - LOG(INFO) << fmt::format( - "Launching kernel: blocks {}, thread/block {}", - blocksPerGrid, - threadsPerBlock); - - VecAdd<<>>(d_A, d_B, d_C, N); - - VecSub<<>>(d_A, d_B, d_D, N); - - // Copy result from device memory to host memory - // h_C contains the result in host memory - cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); - cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost); - - // Verify result - for (i = 0; i < N; ++i) { - sum = h_A[i] + h_B[i]; - diff = h_A[i] - h_B[i]; - if (h_C[i] != sum || h_D[i] != diff) { - LOG(ERROR) << "Result verification failed"; - break; - } - } - - cleanUp(h_A, h_B, h_C, h_D, d_A, d_B, d_C, d_D); -} - -#if HAS_CUPTI_RANGE_PROFILER -bool runTestWithAutoRange( - int deviceNum, - const std::vector& metricNames, - CUcontext cuContext, - bool async) { - - // create a CUPTI range based profiling profiler - // this configures the counter data as well - CuptiRBProfilerSession profiler( - metricNames, deviceNum, 2, 1, async ? 
nullptr : cuContext); - - CUpti_ProfilerRange profilerRange = CUPTI_AutoRange; - CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_KernelReplay; - - if (async) { - profiler.asyncStartAndEnable(profilerRange, profilerReplayMode); - } else { - profiler.start(profilerRange, profilerReplayMode); - profiler.enable(); - } - - VectorAddSubtract(); - - if (!async) { - profiler.disable(); - // stop profiler - profiler.stop(); - } else { - profiler.asyncDisableAndStop(); - } - - auto result = profiler.evaluateMetrics(true); - - // check results - EXPECT_EQ(result.metricNames.size(), 3); - EXPECT_EQ(result.rangeVals.size(), 2); - - for (const auto& measurement : result.rangeVals) { - EXPECT_EQ(measurement.values.size(), 3); - - if (measurement.values.size() == 3) { - // smsp__warps_launched.avg - EXPECT_NE(measurement.values[0], 0); - // smsp__sass_thread_inst_executed_op_dadd_pred_on.sum - // each kernel has 50000 dadd ops - EXPECT_EQ(measurement.values[1], 50000); - // sm__inst_executed_pipe_tensor.sum - //EXPECT_EQ(measurement.values[2], 0); - } - } - return true; -} - -bool runTestWithUserRange( - int deviceNum, - const std::vector& metricNames, - CUcontext cuContext, - bool async = false) { - - // create a CUPTI range based profiling profiler - // this configures the counter data as well - CuptiRBProfilerSession profiler( - metricNames, deviceNum, numRanges, 1, async ? 
nullptr : cuContext); - - CUpti_ProfilerRange profilerRange = CUPTI_UserRange; - CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_UserReplay; - - if (async) { - profiler.asyncStartAndEnable(profilerRange, profilerReplayMode); - { VectorAddSubtract(); } - profiler.disableAndStop(); - } else { - profiler.start(profilerRange, profilerReplayMode); - - /* User takes the resposiblity of replaying the kernel launches */ - bool replay = true; - do { - profiler.beginPass(); - { - profiler.enable(); - - std::string rangeName = "vecAddSub"; - profiler.pushRange(rangeName); - - { VectorAddSubtract(); } - - profiler.popRange(); - profiler.disable(); - } - LOG(INFO) << "Replay starting."; - replay = profiler.endPass(); - - } while (!replay); - - // stop profiler - profiler.stop(); - } - VectorAddSubtract(); - auto result = profiler.evaluateMetrics(true); - - // check results - EXPECT_EQ(result.metricNames.size(), 3); - EXPECT_EQ(result.rangeVals.size(), 1); - - if (result.rangeVals.size() > 0) { - const auto& measurement = result.rangeVals[0]; - EXPECT_EQ(measurement.values.size(), 3); - - if (measurement.values.size() == 3) { - // smsp__warps_launched.avg - EXPECT_NE(measurement.values[0], 0); - // smsp__sass_thread_inst_executed_op_dadd_pred_on.sum - // in async mode multiple passes are not supported yet - if (!async) { - EXPECT_EQ(measurement.values[1], 100000); - } - // sm__inst_executed_pipe_tensor.sum - //EXPECT_EQ(measurement.values[2], 0); - } - } - return true; -} -#endif // HAS_CUPTI_RANGE_PROFILER - -int main(int argc, char* argv[]) { - - CUdevice cuDevice; - - int deviceCount, deviceNum; - int computeCapabilityMajor = 0, computeCapabilityMinor = 0; - - printf("Usage: %s [device_num]\n", argv[0]); - - DRIVER_API_CALL(cuInit(0)); - DRIVER_API_CALL(cuDeviceGetCount(&deviceCount)); - - if (deviceCount == 0) { - LOG(ERROR) << "There is no device supporting CUDA."; - return -2; - } - - if (argc > 1) - deviceNum = atoi(argv[1]); - else - deviceNum = 0; - LOG(INFO) << 
"CUDA Device Number: " << deviceNum; - - DRIVER_API_CALL(cuDeviceGet(&cuDevice, deviceNum)); - DRIVER_API_CALL(cuDeviceGetAttribute( - &computeCapabilityMajor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - cuDevice)); - DRIVER_API_CALL(cuDeviceGetAttribute( - &computeCapabilityMinor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - cuDevice)); - - LOG(INFO) << "Compute Cabapbility = " - << fmt::format("{},{}",computeCapabilityMajor, computeCapabilityMinor); - - if (computeCapabilityMajor < 7) { - LOG(ERROR) << "CUPTI Profiler is not supported with compute capability < 7.0"; - return -2; - } - - CuptiRBProfilerSession::staticInit(); - - // metrics to profile - std::vector metricNames = { - "smsp__warps_launched.avg", - "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", - "sm__inst_executed_pipe_tensor.sum", - }; - - CUcontext cuContext; - DRIVER_API_CALL(cuCtxCreate(&cuContext, 0, cuDevice)); - - VectorAddSubtract(); - -#if HAS_CUPTI_RANGE_PROFILER - CuptiRBProfilerSession::staticInit(); - - if (!runTestWithUserRange(deviceNum, metricNames, cuContext, false)) { - LOG(ERROR) << "Failed to profiler test benchmark in user range"; - } else if (!runTestWithAutoRange(deviceNum, metricNames, cuContext, false)) { - LOG(ERROR) << "Failed to profiler test benchmark in auto range"; - } else if (!runTestWithUserRange(deviceNum, metricNames, cuContext, true)) { - LOG(ERROR) << "Failed to profiler test benchmark in user range async"; - } else if (!runTestWithAutoRange(deviceNum, metricNames, cuContext, true)) { - LOG(ERROR) << "Failed to profiler test benchmark in auto range async"; - } - - CuptiRBProfilerSession::deInitCupti(); -#else - LOG(WARNING) << "CuptiRBProfilerSession is not supported."; -#endif // HAS_CUPTI_RANGE_PROFILER - DRIVER_API_CALL(cuCtxDestroy(cuContext)); - - - return 0; -} diff --git a/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerApiTest.cpp b/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerApiTest.cpp deleted file mode 
100644 index 28cad722c53..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerApiTest.cpp +++ /dev/null @@ -1,113 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include -#include - -#include "include/libkineto.h" -#include "include/Config.h" -#include "src/CuptiRangeProfilerApi.h" - -#include "src/Logger.h" -#include "test/CuptiRangeProfilerTestUtil.h" - -using namespace KINETO_NAMESPACE; - -#if HAS_CUPTI_PROFILER - -TEST(CuptiRangeProfilerApiTest, contextTracking) { - std::vector log_modules( - {"CuptiRangeProfilerApi.cpp"}); - SET_LOG_VERBOSITY_LEVEL(1, log_modules); - - std::array data; - std::array contexts; - for (int i = 0; i < data.size(); i++) { - contexts[i] = reinterpret_cast(&data[i]); - } - - // simulate creating contexts, this calls the trackCudaContexts - // function that would otherwise be called via a callback - uint32_t dev = 0; - for (auto ctx : contexts) { - simulateCudaContextCreate(ctx, dev++); - } - - EXPECT_EQ( - CuptiRBProfilerSession::getActiveDevices(), - std::set({0, 1, 2})); - - simulateCudaContextDestroy(contexts[1], 1); - - EXPECT_EQ( - CuptiRBProfilerSession::getActiveDevices(), - std::set({0, 2})); - - simulateCudaContextDestroy(contexts[0], 0); - simulateCudaContextDestroy(contexts[2], 2); - - EXPECT_TRUE( - CuptiRBProfilerSession::getActiveDevices().empty()); -} - -TEST(CuptiRangeProfilerApiTest, asyncLaunchUserRange) { - std::vector log_modules( - {"CuptiRangeProfilerApi.cpp"}); - SET_LOG_VERBOSITY_LEVEL(1, log_modules); - - // this is bad but the pointer is never accessed - CUcontext ctx0 = reinterpret_cast(10); - simulateCudaContextCreate(ctx0, 0 /*device_id*/); - - auto session = std::make_unique(0, ctx0); - session->asyncStartAndEnable(CUPTI_UserRange, CUPTI_UserReplay); - - simulateKernelLaunch(ctx0, "hello"); - simulateKernelLaunch(ctx0, "foo"); - simulateKernelLaunch(ctx0, "bar"); - - session->asyncDisableAndStop(); - // stop happens after next 
kernel is run - simulateKernelLaunch(ctx0, "bar"); - simulateCudaContextDestroy(ctx0, 0 /*device_id*/); - - EXPECT_EQ(session->passes_ended, 1); - EXPECT_EQ(session->ranges_ended, 1); - EXPECT_TRUE(session->enabled); -} - -TEST(CuptiRangeProfilerApiTest, asyncLaunchAutoRange) { - std::vector log_modules( - {"CuptiRangeProfilerApi.cpp"}); - SET_LOG_VERBOSITY_LEVEL(1, log_modules); - - // this is bad but the pointer is never accessed - CUcontext ctx0 = reinterpret_cast(10); - CUcontext ctx1 = reinterpret_cast(11); - - simulateCudaContextCreate(ctx0, 0 /*device_id*/); - - auto session = std::make_unique(0, ctx0); - session->asyncStartAndEnable(CUPTI_AutoRange, CUPTI_KernelReplay); - - simulateKernelLaunch(ctx0, "hello"); - simulateKernelLaunch(ctx0, "foo"); - simulateKernelLaunch(ctx1, "kernel_on_different_device"); - simulateKernelLaunch(ctx0, "bar"); - - session->asyncDisableAndStop(); - // stop happens after next kernel is run - simulateKernelLaunch(ctx0, "bar"); - simulateCudaContextDestroy(ctx0, 0 /*device_id*/); - - EXPECT_EQ(session->passes_ended, 0); - EXPECT_EQ(session->ranges_ended, 0); - EXPECT_TRUE(session->enabled); - - EXPECT_EQ( - session->getKernelNames(), - std::vector({"hello", "foo", "bar"})) - << "Kernel names were not tracked"; -} - -#endif // HAS_CUPTI_PROFILER diff --git a/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerConfigTest.cpp b/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerConfigTest.cpp deleted file mode 100644 index 3f568968238..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerConfigTest.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#include "include/Config.h" -#include "src/CuptiRangeProfilerConfig.h" - -#include -#include -#include -#include - -using namespace std::chrono; -using namespace KINETO_NAMESPACE; - -class CuptiRangeProfilerConfigTest : public ::testing::Test { - protected: - void SetUp() override { - CuptiRangeProfilerConfig::registerFactory(); - } -}; - -TEST_F(CuptiRangeProfilerConfigTest, ConfigureProfiler) { - Config cfg; - std::vector metrics = { - "kineto__cuda_core_flops", - "sm__inst_executed.sum", - "l1tex__data_bank_conflicts_pipe_lsu.sum", - }; - auto metricsConfigStr = - fmt::format("CUPTI_PROFILER_METRICS = {}", fmt::join(metrics, ",")); - - EXPECT_TRUE(cfg.parse(metricsConfigStr)); - EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = true")); - EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_MAX_RANGES = 42")); - - const CuptiRangeProfilerConfig& cupti_cfg = - CuptiRangeProfilerConfig::get(cfg); - - EXPECT_EQ(cupti_cfg.activitiesCuptiMetrics(), metrics); - EXPECT_EQ(cupti_cfg.cuptiProfilerPerKernel(), true); - EXPECT_EQ(cupti_cfg.cuptiProfilerMaxRanges(), 42); - -} - -TEST_F(CuptiRangeProfilerConfigTest, RangesDefaults) { - Config cfg, cfg_auto; - - // do not set max ranges in config, check defaults are sane - EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_METRICS = kineto__cuda_core_flops")); - EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = false")); - - cfg.setSignalDefaults(); - - EXPECT_TRUE(cfg_auto.parse("CUPTI_PROFILER_METRICS = kineto__cuda_core_flops")); - EXPECT_TRUE(cfg_auto.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = true")); - - cfg_auto.setClientDefaults(); - - int user_ranges, auto_ranges; - - user_ranges = CuptiRangeProfilerConfig::get(cfg).cuptiProfilerMaxRanges(); - auto_ranges = CuptiRangeProfilerConfig::get(cfg_auto).cuptiProfilerMaxRanges(); - - EXPECT_GE(user_ranges, 1) << " in user range mode default to at least 1 ranges"; - EXPECT_GE(auto_ranges, 1000) << " in auto range mode default to at least 1000 ranges"; - - EXPECT_GT(auto_ranges, 
user_ranges); -} diff --git a/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerTestUtil.h b/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerTestUtil.h deleted file mode 100644 index 861b65fd701..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CuptiRangeProfilerTestUtil.h +++ /dev/null @@ -1,96 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "CuptiRangeProfilerApi.h" - -namespace KINETO_NAMESPACE { - -#if HAS_CUPTI_PROFILER - -class MockCuptiRBProfilerSession : public CuptiRBProfilerSession { - public: - MockCuptiRBProfilerSession(int deviceId, CUcontext ctx) - : CuptiRBProfilerSession(deviceId, ctx) {} - - void beginPass() override { - LOG(INFO) << " Mock CUPTI begin pass"; - passes_started++; - } - - bool endPass() override { - passes_ended++; - return true; - } - - void flushCounterData() override {} - - void pushRange(const std::string& rangeName) override { - LOG(INFO) << " Mock CUPTI pushrange ( " << rangeName << " )"; - ranges_started++; - } - - void popRange() override { - LOG(INFO) << " Mock CUPTI poprange"; - ranges_ended++; - } - - void stop() override { - runChecks(); - } - - void enable() override { - enabled = true; - } - void disable() override {} - - CuptiProfilerResult evaluateMetrics(bool /*verbose*/) override { - return result; - } - -protected: - void startInternal( - CUpti_ProfilerRange profilerRange, - CUpti_ProfilerReplayMode profilerReplayMode) override { - curRange_ = profilerRange; - curReplay_ = profilerReplayMode; - } - -private: - void runChecks() { - EXPECT_EQ(passes_started, passes_ended); - EXPECT_EQ(ranges_started, ranges_ended); - } - - public: - int passes_started = 0; - int passes_ended = 0; - int ranges_started = 0; - int ranges_ended = 0; - bool enabled = false; - - CuptiProfilerResult result; - -}; - -inline void 
simulateCudaContextCreate(CUcontext context, uint32_t dev) { - testing::trackCudaCtx( - context, dev, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); -} - -inline void simulateCudaContextDestroy(CUcontext context, uint32_t dev) { - testing::trackCudaCtx( - context, dev, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); -} - -inline void simulateKernelLaunch( - CUcontext context, const std::string& kernelName) { - testing::trackCudaKernelLaunch(context, kernelName.c_str()); -} - -#endif // HAS_CUPTI_PROFILER - -} // namespace KINETO_NAMESPACE diff --git a/plugins/tensorboard-plugins/libkineto/test/CuptiStringsTest.cpp b/plugins/tensorboard-plugins/libkineto/test/CuptiStringsTest.cpp deleted file mode 100644 index 405f9404a49..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/CuptiStringsTest.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include - -#include "src/cupti_strings.h" - -using namespace KINETO_NAMESPACE; - -TEST(CuptiStringsTest, Valid) { - ASSERT_STREQ( - runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_INVALID), "INVALID"); - ASSERT_STREQ( - runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020), - "cudaDriverGetVersion"); - ASSERT_STREQ(runtimeCbidName - (CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020), - "cudaDeviceSynchronize"); - ASSERT_STREQ( - runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000), - "cudaStreamSetAttribute_ptsz"); -} - -TEST(CuptiStringsTest, Invalid) { - ASSERT_STREQ(runtimeCbidName(-1), "INVALID"); - // We can't actually use CUPTI_RUNTIME_TRACE_CBID_SIZE here until we - // auto-generate the string table, since it may have more entries than - // the enum in the version used to compile. 
- ASSERT_STREQ(runtimeCbidName(1000), "INVALID"); -} diff --git a/plugins/tensorboard-plugins/libkineto/test/EventProfilerTest.cpp b/plugins/tensorboard-plugins/libkineto/test/EventProfilerTest.cpp deleted file mode 100644 index cb36c826a7f..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/EventProfilerTest.cpp +++ /dev/null @@ -1,578 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "src/EventProfiler.h" - -#include -#include -#include - -using namespace std::chrono; -using namespace KINETO_NAMESPACE; - -TEST(PercentileTest, Create) { - PercentileList pct = {{10, SampleValue(0)}, - {49, SampleValue(0)}, - {50, SampleValue(0)}, - {90, SampleValue(0)}}; - - percentiles({0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, pct); - EXPECT_EQ(pct[0].second.getInt(), 10); - EXPECT_EQ(pct[1].second.getInt(), 50); - EXPECT_EQ(pct[2].second.getInt(), 50); - EXPECT_EQ(pct[3].second.getInt(), 90); - - percentiles({80, 10, 20, 70, 60, 40, 90, 30, 50, 0, 100}, pct); - EXPECT_EQ(pct[0].second.getInt(), 10); - EXPECT_EQ(pct[1].second.getInt(), 50); - EXPECT_EQ(pct[2].second.getInt(), 50); - EXPECT_EQ(pct[3].second.getInt(), 90); - - percentiles({80}, pct); - EXPECT_EQ(pct[0].second.getInt(), 80); - EXPECT_EQ(pct[1].second.getInt(), 80); - EXPECT_EQ(pct[2].second.getInt(), 80); - EXPECT_EQ(pct[3].second.getInt(), 80); - - percentiles({80, 50}, pct); - EXPECT_EQ(pct[0].second.getInt(), 50); - EXPECT_EQ(pct[1].second.getInt(), 50); - EXPECT_EQ(pct[2].second.getInt(), 80); - EXPECT_EQ(pct[3].second.getInt(), 80); -} - -TEST(PercentileTest, Normalize) { - PercentileList pct = { - {10, SampleValue(10)}, {50, SampleValue(100.0)}, {90, SampleValue(2000)}}; - - normalize(pct, 2.5); - - EXPECT_EQ(pct[0].second.getInt(), 25); - EXPECT_EQ((int)pct[1].second.getDouble(), 250); - EXPECT_EQ(pct[2].second.getInt(), 5000); -} - -TEST(EventTest, SumSamples) { - Event ev; - ev.instanceCount = 4; - auto t = system_clock::now(); - ev.addSample(t, {1, 
2, 3, 4}); - ev.addSample(t, {10, 20, 30, 40}); - ev.addSample(t, {100, 200, 300, 400}); - - EXPECT_EQ(ev.sumInstance(0, {0, 0, 3}), 1); - EXPECT_EQ(ev.sumInstance(0, {0, 1, 3}), 10); - EXPECT_EQ(ev.sumInstance(0, {0, 2, 3}), 100); - - EXPECT_EQ(ev.sumInstance(0, {0, 0, 1}), 111); - - EXPECT_EQ(ev.sumInstance(3, {0, 0, 1}), 444); - - // Non-zero offset - EXPECT_EQ(ev.sumInstance(0, {1, 0, 2}), 10); - EXPECT_EQ(ev.sumInstance(0, {1, 1, 2}), 100); - EXPECT_EQ(ev.sumInstance(0, {1, 0, 1}), 110); - - ev.addSample(t, {1000, 2000, 3000, 4000}); - - EXPECT_EQ(ev.sumInstance(0, {1, 0, 3}), 10); - EXPECT_EQ(ev.sumInstance(0, {1, 1, 3}), 100); - EXPECT_EQ(ev.sumInstance(0, {2, 1, 2}), 1000); - EXPECT_EQ(ev.sumInstance(0, {2, 0, 1}), 1100); - - EXPECT_EQ(ev.sumAll({0, 0, 4}), 10); - EXPECT_EQ(ev.sumAll({1, 0, 3}), 100); - EXPECT_EQ(ev.sumAll({2, 1, 2}), 10000); - EXPECT_EQ(ev.sumAll({0, 1, 2}), 11000); - EXPECT_EQ(ev.sumAll({0, 0, 1}), 11110); -} - -TEST(EventTest, Percentiles) { - Event ev; - ev.instanceCount = 4; - auto t = system_clock::now(); - ev.addSample(t, {3, 2, 1, 4}); - ev.addSample(t, {30, 20, 10, 40}); - ev.addSample(t, {300, 200, 100, 400}); - - PercentileList pct = { - {10, SampleValue(0)}, {50, SampleValue(0)}, {90, SampleValue(0)}}; - - ev.percentiles(pct, {0, 0, 3}); - EXPECT_EQ(pct[0].second.getInt(), 1); - EXPECT_EQ(pct[1].second.getInt(), 3); - EXPECT_EQ(pct[2].second.getInt(), 4); - - ev.percentiles(pct, {0, 0, 1}); - EXPECT_EQ(pct[0].second.getInt(), 111); - EXPECT_EQ(pct[1].second.getInt(), 333); - EXPECT_EQ(pct[2].second.getInt(), 444); -} - -class MockCuptiMetrics : public CuptiMetricApi { - public: - MockCuptiMetrics() : CuptiMetricApi(0) {} - MOCK_METHOD1(idFromName, CUpti_MetricID(const std::string& name)); - MOCK_METHOD1( - events, - std::map(CUpti_MetricID metric_id)); - MOCK_METHOD1(valueKind, CUpti_MetricValueKind(CUpti_MetricID metric)); - MOCK_METHOD1( - evaluationMode, - CUpti_MetricEvaluationMode(CUpti_MetricID metric)); - MOCK_METHOD5( - 
calculate, - SampleValue( - CUpti_MetricID metric, - CUpti_MetricValueKind kind, - std::vector& events, - std::vector& values, - int64_t duration)); -}; - -TEST(MetricTest, Calculate) { - using ::testing::Return; - MockCuptiMetrics metrics; - - // The events used for the ipc metrics: instructions and cycles - // Pretend we have 2 SMs and 2 samples of each event - Event instr("instructions"); - instr.instanceCount = 2; - auto t = system_clock::now(); - instr.addSample(t, {100, 200}); - instr.addSample(t, {300, 400}); - - Event cycles("cycles"); - cycles.instanceCount = 2; - cycles.addSample(t, {1000, 1200}); - cycles.addSample(t, {1300, 1300}); - - // 2 & 3 are the event ids we specified in the metric - std::map events; - events[2] = std::move(instr); - events[3] = std::move(cycles); - - // Define an ipc metric - EXPECT_CALL(metrics, valueKind(1)) - .Times(1) - .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE)); - Metric m( - "ipc", 1, {2, 3}, CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, metrics); - - // Calculate metric for first sample - // Since evaluation mode is CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, - // Cupti API will be called three times: once for each SM (2) and once - // to get the total across SMs. 
- std::vector ids = {2, 3}; - std::vector vals = {100, 1000}; - EXPECT_CALL( - metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000)) - .Times(1) - .WillOnce(Return(SampleValue(0.1))); - vals = {200, 1200}; - EXPECT_CALL( - metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000)) - .Times(1) - .WillOnce(Return(SampleValue(0.17))); - vals = {300, 2200}; - EXPECT_CALL( - metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000)) - .Times(1) - .WillOnce(Return(SampleValue(0.14))); - auto v = m.calculate(events, nanoseconds(1000), {0, 0, 2}); - - EXPECT_EQ(v.perInstance.size(), 2); - EXPECT_EQ(v.perInstance[0].getDouble(), 0.1); - EXPECT_EQ(v.perInstance[1].getDouble(), 0.17); - EXPECT_EQ(v.total.getDouble(), 0.14); - - // Calculate second sample. - // Change evaluation mode to CUPTI_METRIC_EVALUATION_MODE_AGGREGATE. - // Now we should get only one call to the Cupti API for the total. - EXPECT_CALL(metrics, valueKind(1)) - .Times(1) - .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE)); - Metric m2("ipc", 1, {2, 3}, CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, metrics); - vals = {700, 2600}; - EXPECT_CALL( - metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000)) - .Times(1) - .WillOnce(Return(SampleValue(0.27))); - v = m2.calculate(events, nanoseconds(1000), {0, 1, 2}); - - EXPECT_EQ(v.perInstance.size(), 1); - EXPECT_EQ(v.perInstance[0].getDouble(), 0.27); - EXPECT_EQ(v.total.getDouble(), 0.27); -} - -class MockCuptiEvents : public CuptiEventApi { - public: - MOCK_METHOD1( - createGroupSets, - CUpti_EventGroupSets*(std::vector& ids)); - MOCK_METHOD1(destroyGroupSets, void(CUpti_EventGroupSets* sets)); - MOCK_METHOD0(setContinuousMode, bool()); - MOCK_METHOD1(enablePerInstance, void(CUpti_EventGroup eventGroup)); - MOCK_METHOD1(instanceCount, uint32_t(CUpti_EventGroup eventGroup)); - MOCK_METHOD1(enableGroupSet, void(CUpti_EventGroupSet& set)); - MOCK_METHOD1(disableGroupSet, void(CUpti_EventGroupSet& set)); - 
MOCK_METHOD3( - readEvent, - void(CUpti_EventGroup g, CUpti_EventID id, std::vector& vals)); - MOCK_METHOD1(eventsInGroup, std::vector(CUpti_EventGroup g)); - MOCK_METHOD1(eventId, CUpti_EventID(const std::string& name)); -}; - -TEST(EventGroupSetTest, CollectSample) { - using ::testing::_; - using ::testing::Return; - using ::testing::SetArgPointee; - const CUpti_EventGroup g1{nullptr}; - const CUpti_EventGroup g2{reinterpret_cast(0x1000)}; - CUpti_EventGroup groups[] = {g1, g2}; - CUpti_EventGroupSet set; - set.eventGroups = groups; - set.numEventGroups = 2; - - std::map events; - Event instr("instructions"); - events[4] = std::move(instr); - Event cycles("cycles"); - events[5] = std::move(cycles); - Event branches("branches"); - events[10] = std::move(branches); - - MockCuptiEvents cupti_events; - EXPECT_CALL(cupti_events, enablePerInstance(g1)).Times(1); - EXPECT_CALL(cupti_events, enablePerInstance(g2)).Times(1); - EXPECT_CALL(cupti_events, instanceCount(g1)).Times(1).WillOnce(Return(80)); - EXPECT_CALL(cupti_events, instanceCount(g2)).Times(1).WillOnce(Return(40)); - std::vector events_in_group1 = {4, 5}; - EXPECT_CALL(cupti_events, eventsInGroup(g1)) - .Times(1) - .WillOnce(Return(events_in_group1)); - std::vector events_in_group2 = {10}; - EXPECT_CALL(cupti_events, eventsInGroup(g2)) - .Times(1) - .WillOnce(Return(events_in_group2)); - EventGroupSet group_set(set, events, cupti_events); - - EXPECT_EQ(group_set.groupCount(), 2); - EXPECT_EQ(events[4].instanceCount, 80); - EXPECT_EQ(events[5].instanceCount, 80); - EXPECT_EQ(events[10].instanceCount, 40); - - // This should not cause any Cupti API action as the group - // set is already disabled - group_set.setEnabled(false); - - // Activate group set - if activated twice, only the first - // should cause cupti API to be called - EXPECT_CALL(cupti_events, enableGroupSet(_)).Times(1); - group_set.setEnabled(false); - group_set.setEnabled(true); - - EXPECT_CALL(cupti_events, eventsInGroup(g1)) - .Times(1) - 
.WillOnce(Return(events_in_group1)); - EXPECT_CALL(cupti_events, eventsInGroup(g2)) - .Times(1) - .WillOnce(Return(events_in_group2)); - EXPECT_CALL(cupti_events, readEvent(g1, 4, _)).Times(1); - EXPECT_CALL(cupti_events, readEvent(g1, 5, _)).Times(1); - EXPECT_CALL(cupti_events, readEvent(g2, 10, _)).Times(1); - group_set.collectSample(); - - EXPECT_EQ(events[4].sampleCount(), 1); - EXPECT_EQ(events[5].sampleCount(), 1); - EXPECT_EQ(events[10].sampleCount(), 1); -} - -class MockLogger : public SampleListener { - public: - MOCK_METHOD3(handleSample, void(int device, const Sample& sample, bool from_new_version)); - MOCK_METHOD1(update, void(const Config& config)); -}; - -class EventProfilerTest : public ::testing::Test { - protected: - void SetUp() override { - auto cupti_events_ptr = std::make_unique(); - auto cupti_metrics_ptr = std::make_unique(); - cuptiEvents_ = cupti_events_ptr.get(); - cuptiMetrics_ = cupti_metrics_ptr.get(); - loggers_.push_back(std::make_unique()); - onDemandLoggers_.push_back(std::make_unique()); - profiler_ = std::make_unique( - std::move(cupti_events_ptr), - std::move(cupti_metrics_ptr), - loggers_, - onDemandLoggers_); - - for (int i = 0; i < kEventGroupCount; i++) { - eventGroups_[i] = &eventGroups_[i]; - } - for (int i = 0; i < kGroupSetCount; i++) { - // Default size to 1 but can be changed by test - groupSet_[i].numEventGroups = 1; - // Two groups per set - groupSet_[i].eventGroups = &eventGroups_[i * 2]; - } - groupSets_.numSets = 1; - groupSets_.sets = groupSet_; - } - - MockCuptiEvents* cuptiEvents_; - MockCuptiMetrics* cuptiMetrics_; - std::vector> loggers_; - std::vector> onDemandLoggers_; - constexpr static int kEventGroupCount = 4; - constexpr static int kGroupSetCount = 2; - CUpti_EventGroup eventGroups_[kEventGroupCount]; - CUpti_EventGroupSet groupSet_[kGroupSetCount]; - CUpti_EventGroupSets groupSets_; - std::unique_ptr profiler_; -}; - -TEST_F(EventProfilerTest, ConfigureFailure) { - using namespace testing; - - // 
Default config has no counters enabled. - // Check that profiler remains disabled. - Config cfg; - profiler_->configure(cfg, nullptr); - - EXPECT_FALSE(profiler_->enabled()); - - // There is no event named "cycles" - // In this case the profiler should print a warning and remain disabled - bool parsed = cfg.parse("EVENTS = cycles"); - EXPECT_TRUE(parsed); - - // EventProfiler should handle exception thrown from createGroupSets - // Configuration will be applied twice - once for combined base + on-demand - // and then again falling back to base - EXPECT_CALL(*cuptiEvents_, eventId("cycles")) - .Times(2) - .WillRepeatedly(Return(0)); - std::vector ids = {0}; - EXPECT_CALL(*cuptiEvents_, createGroupSets(ids)) - .Times(2) - .WillRepeatedly(Throw( - std::system_error(EINVAL, std::generic_category(), "Event ID"))); - profiler_->configure(cfg, nullptr); - - EXPECT_FALSE(profiler_->enabled()); -} - -TEST_F(EventProfilerTest, ConfigureBase) { - using namespace testing; - - // Test normal path, simple base config - Config cfg; - bool parsed = cfg.parse("EVENTS = elapsed_cycles_sm"); - EXPECT_TRUE(parsed); - - // One valid event - expect one call to eventId and createGroupSets - EXPECT_CALL(*cuptiEvents_, eventId("elapsed_cycles_sm")) - .Times(1) - .WillOnce(Return(5)); - std::vector ids = {5}; - EXPECT_CALL(*cuptiEvents_, createGroupSets(ids)) - .Times(1) - .WillOnce(Return(&groupSets_)); - EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[0])).Times(1); - EXPECT_CALL(*cuptiEvents_, instanceCount(eventGroups_[0])) - .Times(1) - .WillOnce(Return(80)); - EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[0])) - .Times(1) - .WillOnce(Return(ids)); - EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1); - - profiler_->configure(cfg, nullptr); - - EXPECT_TRUE(profiler_->enabled()); -} - -TEST_F(EventProfilerTest, ConfigureOnDemand) { - using namespace testing; - - // Test base + on-demand config, one event and one metric - Config cfg, on_demand_cfg; - bool parsed = 
cfg.parse(R"( - EVENTS = active_cycles - SAMPLE_PERIOD_MSECS=500 - REPORT_PERIOD_SECS=10 - SAMPLES_PER_REPORT=5 - )"); - EXPECT_TRUE(parsed); - - parsed = on_demand_cfg.parse(R"( - METRICS = ipc - EVENTS_DURATION_SECS=60 - SAMPLE_PERIOD_MSECS=200 - MULTIPLEX_PERIOD_MSECS=2000 - REPORT_PERIOD_SECS=3 - SAMPLES_PER_REPORT=10 - )"); - EXPECT_TRUE(parsed); - - // One event - EXPECT_CALL(*cuptiEvents_, eventId("active_cycles")) - .Times(1) - .WillOnce(Return(3)); - // One metric - EXPECT_CALL(*cuptiMetrics_, idFromName("ipc")).Times(1).WillOnce(Return(10)); - std::map ipc_events; - ipc_events[4] = "instructions"; - ipc_events[5] = "elapsed_cycles_sm"; - EXPECT_CALL(*cuptiMetrics_, events(10)).Times(1).WillOnce(Return(ipc_events)); - EXPECT_CALL(*cuptiMetrics_, evaluationMode(10)) - .Times(1) - .WillOnce(Return(CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE)); - EXPECT_CALL(*cuptiMetrics_, valueKind(10)) - .Times(1) - .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE)); - std::vector ids = {3, 4, 5}; - groupSet_[0].numEventGroups = 2; - groupSets_.numSets = 2; - EXPECT_CALL(*cuptiEvents_, createGroupSets(ids)) - .Times(1) - .WillOnce(Return(&groupSets_)); - // Specified CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE per instance above - // So check that it's enabled - EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[0])).Times(1); - EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[1])).Times(1); - EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[2])).Times(1); - std::vector ids_g1{3}, ids_g2{4}, ids_g3{5}; - EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[0])) - .Times(1) - .WillOnce(Return(ids_g1)); - EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[1])) - .Times(1) - .WillOnce(Return(ids_g2)); - EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[2])) - .Times(1) - .WillOnce(Return(ids_g3)); - EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1); - - profiler_->configure(cfg, &on_demand_cfg); - - EXPECT_TRUE(profiler_->enabled()); - 
EXPECT_EQ(profiler_->samplePeriod().count(), 250); - EXPECT_EQ(profiler_->multiplexPeriod().count(), 1000); - EXPECT_EQ(profiler_->reportPeriod().count(), 10000); - EXPECT_EQ(profiler_->onDemandReportPeriod().count(), 4000); -} - -TEST_F(EventProfilerTest, ReportSample) { - using namespace testing; - - // Test base + on-demand config, one event and one metric - Config cfg, on_demand_cfg; - bool parsed = cfg.parse("EVENTS = active_cycles"); - EXPECT_TRUE(parsed); - - parsed = on_demand_cfg.parse(R"( - METRICS = ipc - EVENTS_DURATION_SECS=60 - )"); - EXPECT_TRUE(parsed); - - // One event - EXPECT_CALL(*cuptiEvents_, eventId("active_cycles")) - .Times(1) - .WillOnce(Return(3)); - // One metric - EXPECT_CALL(*cuptiMetrics_, idFromName("ipc")).Times(1).WillOnce(Return(10)); - std::map ipc_events; - ipc_events[4] = "instructions"; - ipc_events[5] = "elapsed_cycles_sm"; - EXPECT_CALL(*cuptiMetrics_, events(10)).Times(1).WillOnce(Return(ipc_events)); - EXPECT_CALL(*cuptiMetrics_, evaluationMode(10)) - .Times(1) - .WillOnce(Return(CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE)); - EXPECT_CALL(*cuptiMetrics_, valueKind(10)) - .Times(1) - .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE)); - std::vector ids = {3, 4, 5}; - groupSet_[0].numEventGroups = 2; - groupSets_.numSets = 2; - EXPECT_CALL(*cuptiEvents_, createGroupSets(ids)) - .Times(1) - .WillOnce(Return(&groupSets_)); - EXPECT_CALL(*cuptiEvents_, instanceCount(_)) - .Times(3) - .WillRepeatedly(Return(4)); - std::vector ids_g1{3}, ids_g2{4}, ids_g3{5}; - // These will be called by collectSample() as well, which is called twice - // per group set - EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[0])) - .Times(3) - .WillRepeatedly(Return(ids_g1)); - EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[1])) - .Times(3) - .WillRepeatedly(Return(ids_g2)); - EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[2])) - .Times(3) - .WillRepeatedly(Return(ids_g3)); - EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1); - - 
profiler_->configure(cfg, &on_demand_cfg); - - EXPECT_TRUE(profiler_->enabled()); - - EXPECT_CALL(*cuptiEvents_, readEvent(_, _, _)) - .Times(6) - .WillRepeatedly(Invoke( - [](CUpti_EventGroup g, CUpti_EventID id, std::vector& vals) { - vals = {1, 2, 3, 4}; - })); - - // Need to collect four times - twice for each group set - profiler_->collectSample(); - profiler_->collectSample(); - EXPECT_CALL(*cuptiEvents_, disableGroupSet(_)).Times(1); - EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1); - profiler_->enableNextCounterSet(); - profiler_->collectSample(); - profiler_->collectSample(); - - std::vector ipc_ids = {4, 5}; - // Called once for each instance (4) and once for the total. - // x2 since we recompute per logger. - EXPECT_CALL( - *cuptiMetrics_, - calculate(10, CUPTI_METRIC_VALUE_KIND_DOUBLE, ipc_ids, _, 2000000000)) - .Times(10) - .WillRepeatedly(Return(SampleValue(0.3))); - auto& logger = dynamic_cast(*loggers_[0]); - EXPECT_CALL(logger, handleSample(0, _, _)) - .Times(1) - .WillOnce(Invoke([](int device, const Sample& sample, bool from_new_version) { - // Sample will include all stats - logger must pick the - // ones it wants. - EXPECT_EQ(sample.stats.size(), 4); - EXPECT_EQ(sample.stats[0].name, "active_cycles"); - EXPECT_EQ(sample.stats[1].name, "instructions"); - EXPECT_EQ(sample.stats[2].name, "elapsed_cycles_sm"); - EXPECT_EQ(sample.stats[3].name, "ipc"); - // 2 samples, each with values {1, 2, 3, 4} - // i.e. 
{2, 4, 6, 8} total - EXPECT_EQ(sample.stats[0].total.getInt(), 20); - EXPECT_EQ(sample.stats[0].percentileValues[0].second.getInt(), 2); - EXPECT_EQ(sample.stats[0].percentileValues.back().second.getInt(), 8); - // ipc is always 0.3 from mocked calculate function above - EXPECT_EQ(sample.stats[3].total.getDouble(), 0.3); - EXPECT_EQ(sample.stats[3].percentileValues[0].second.getDouble(), 0.3); - EXPECT_EQ( - sample.stats[3].percentileValues.back().second.getDouble(), 0.3); - })); - profiler_->reportSamples(); - - auto& on_demand_logger = dynamic_cast(*onDemandLoggers_[0]); - EXPECT_CALL(on_demand_logger, handleSample(0, _, _)).Times(1); - profiler_->reportOnDemandSamples(); - - EXPECT_CALL(*cuptiEvents_, disableGroupSet(_)).Times(1); -} diff --git a/plugins/tensorboard-plugins/libkineto/test/LoggerObserverTest.cpp b/plugins/tensorboard-plugins/libkineto/test/LoggerObserverTest.cpp deleted file mode 100644 index 30ba4a824af..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/LoggerObserverTest.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "include/libkineto.h" -#include "src/Logger.h" -#include "LoggerCollector.h" - -using namespace KINETO_NAMESPACE; - -#if !USE_GOOGLE_LOG - -constexpr char InfoTestStr[] = "Checking LOG(INFO)"; -constexpr char WarningTestStr[] = "Checking LOG(WARNING)"; -constexpr char ErrorTestStr[] = "Checking LOG(ERROR)"; - -TEST(LoggerObserverTest, SingleCollectorObserver) { - // Add a LoggerObserverCollector to collect all logs during the trace. 
- std::unique_ptr lCollector = std::make_unique(); - Logger::addLoggerObserver(lCollector.get()); - - LOG(INFO) << InfoTestStr; - LOG(WARNING) << WarningTestStr; - LOG(ERROR) << ErrorTestStr; - - auto LoggerMD = lCollector->extractCollectorMetadata(); - EXPECT_TRUE(LoggerMD[LoggerOutputType::INFO][0].find(InfoTestStr) != std::string::npos); - EXPECT_TRUE(LoggerMD[LoggerOutputType::WARNING][0].find(WarningTestStr) != std::string::npos); - EXPECT_TRUE(LoggerMD[LoggerOutputType::ERROR][0].find(ErrorTestStr) != std::string::npos); - - Logger::removeLoggerObserver(lCollector.get()); -} - -#define NUM_OF_MESSAGES_FOR_EACH_TYPE 10 -#define NUM_OF_WRITE_THREADS 200 - -// Writes NUM_OF_MESSAGES_FOR_EACH_TYPE messages for each INFO, WARNING, and ERROR. -// NOLINTNEXTLINE(clang-diagnostic-unused-parameter) -void* writeSeveralMessages(void* ptr) { - for(int i=0; i lc1 = std::make_unique(); - std::unique_ptr lc2 = std::make_unique(); - std::unique_ptr lc3 = std::make_unique(); - std::unique_ptr lc4 = std::make_unique(); - Logger::addLoggerObserver(lc1.get()); - Logger::addLoggerObserver(lc2.get()); - Logger::addLoggerObserver(lc3.get()); - Logger::addLoggerObserver(lc4.get()); - - // Launch NUM_OF_WRITE_THREADS threads writing several messages. - pthread_t ListOfThreads[NUM_OF_WRITE_THREADS]; - for (int i=0; iextractCollectorMetadata(); - int InfoCount = 0, WarnCount = 0, ErrorCount = 0; - for (auto& md : lc1MD) { - InfoCount += md.first == LoggerOutputType::INFO ? md.second.size() : 0; - WarnCount += md.first == LoggerOutputType::WARNING ? md.second.size() : 0; - ErrorCount += md.first == LoggerOutputType::ERROR ? 
md.second.size() : 0; - } - - EXPECT_EQ(InfoCount, NUM_OF_WRITE_THREADS * NUM_OF_MESSAGES_FOR_EACH_TYPE); - EXPECT_EQ(WarnCount, NUM_OF_WRITE_THREADS * NUM_OF_MESSAGES_FOR_EACH_TYPE); - EXPECT_EQ(ErrorCount, NUM_OF_WRITE_THREADS * NUM_OF_MESSAGES_FOR_EACH_TYPE); - - Logger::removeLoggerObserver(lc1.get()); - Logger::removeLoggerObserver(lc2.get()); - Logger::removeLoggerObserver(lc3.get()); - Logger::removeLoggerObserver(lc4.get()); -} - -#endif // !USE_GOOGLE_LOG - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.cpp b/plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.cpp deleted file mode 100644 index 89f1d536ca8..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include -#include -#include - -#include "test/MockActivitySubProfiler.h" - -namespace libkineto { - -const std::set supported_activities {ActivityType::CPU_OP}; -const std::string profile_name{"MockProfiler"}; - -void MockProfilerSession::processTrace(ActivityLogger& logger) { - for (const auto& activity: activities()) { - activity.log(logger); - } -} - -const std::string& MockActivityProfiler::name() const { - return profile_name; -} - -const std::set& MockActivityProfiler::availableActivities() const { - return supported_activities; -} - -MockActivityProfiler::MockActivityProfiler( - std::vector& activities) : - test_activities_(activities) {}; - -std::unique_ptr MockActivityProfiler::configure( - const std::set& /*activity_types*/, - const Config& /*config*/) { - auto session = std::make_unique(); - session->set_test_activities(std::move(test_activities_)); - return session; -}; - -std::unique_ptr MockActivityProfiler::configure( - int64_t /*ts_ms*/, - int64_t /*duration_ms*/, - const std::set& 
activity_types, - const Config& config) { - return configure(activity_types, config); -}; - -} // namespace libkineto - diff --git a/plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.h b/plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.h deleted file mode 100644 index 36eaa13d1a5..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/MockActivitySubProfiler.h +++ /dev/null @@ -1,72 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include - -#include "include/IActivityProfiler.h" - -namespace libkineto { - -class MockProfilerSession: public IActivityProfilerSession { - - public: - explicit MockProfilerSession() {} - - void start() override { - start_count++; - status_ = TraceStatus::RECORDING; - } - - void stop() override { - stop_count++; - status_ = TraceStatus::PROCESSING; - } - - std::vector& activities() override { - return test_activities_; - } - - std::vector errors() override { - return {}; - } - - void processTrace(ActivityLogger& logger) override; - - void set_test_activities(std::vector&& acs) { - test_activities_ = std::move(acs); - } - - int start_count = 0; - int stop_count = 0; - private: - std::vector test_activities_; -}; - - -class MockActivityProfiler: public IActivityProfiler { - - public: - explicit MockActivityProfiler(std::vector& activities); - - const std::string& name() const override; - - const std::set& availableActivities() const override; - - std::unique_ptr configure( - const std::set& activity_types, - const Config& config) override; - - std::unique_ptr configure( - int64_t ts_ms, - int64_t duration_ms, - const std::set& activity_types, - const Config& config) override; - - private: - std::vector test_activities_; -}; - -} // namespace libkineto diff --git a/plugins/tensorboard-plugins/libkineto/test/PidInfoTest.cpp b/plugins/tensorboard-plugins/libkineto/test/PidInfoTest.cpp deleted file mode 100644 index 
b86cfb36d05..00000000000 --- a/plugins/tensorboard-plugins/libkineto/test/PidInfoTest.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include "include/ThreadUtil.h" - -#include -#include - -#include -#include - -using namespace KINETO_NAMESPACE; - -TEST(ThreadNameTest, setAndGet) { - setThreadName("ThreadNameTest"); - EXPECT_EQ(getThreadName(), "ThreadNameTest"); - - setThreadName(""); - EXPECT_EQ(getThreadName(), ""); - - // Spaces etc are ok - setThreadName("Name w/ spaces"); - EXPECT_EQ(getThreadName(), "Name w/ spaces"); - - // More than 16 chars is not OK - setThreadName("More than 16 characters"); - EXPECT_EQ(getThreadName(), "Name w/ spaces"); -} -- Gitee From 23555cedf9cd188f886ca1232281946c50573d28 Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Thu, 1 Aug 2024 09:54:49 +0800 Subject: [PATCH 08/94] update_profiler_poc_code --- .idea/.gitignore | 3 + .idea/att.iml | 18 ++ .idea/inspectionProfiles/Project_Default.xml | 19 ++ .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + profiler/README.md | 1 + profiler/advisor/README.md | 11 +- profiler/advisor/analyzer/base_analyzer.py | 22 +- .../analyzer/cluster/slow_link_analyser.py | 18 +- .../analyzer/cluster/slow_rank_analyser.py | 6 +- .../computation/ai_core_freq}/__init__.py | 0 .../ai_core_freq/ai_core_freq_analyzer.py | 36 +++ .../ai_core_freq/ai_core_freq_checker.py | 100 +++++++ .../computation/aicpu/aicpu_checker.py | 6 +- .../computation/bound/block_dim_checker.py | 1 - .../computation/profiling_analyzer.py | 9 +- .../dataloader/dataloader_analyzer.py | 30 +++ .../analyzer/dataloader/dataloader_checker.py | 84 ++++++ .../graph_fusion/graph_fusion_checker.py | 2 +- .../analyzer/overall/overall_analyzer.py | 45 ---- .../overall/overall_summary_analyzer.py | 240 ++++++++--------- .../analyzer/schedule/syncbn}/__init__.py | 0 
.../schedule/syncbn/syncbn_analyzer.py | 30 +++ .../schedule/syncbn/syncbn_checker.py | 70 +++++ .../schedule/synchronize_stream}/__init__.py | 0 .../synchronize_stream_analyzer.py | 32 +++ .../synchronize_stream_checker.py | 89 ++++++ .../schedule/timeline_base_checker.py | 91 +++++++ profiler/advisor/common/analyzer_scopes.py | 4 + profiler/advisor/common/constant.py | 9 +- profiler/advisor/common/graph/graph_parser.py | 9 +- profiler/advisor/common/profiling/ge_info.py | 3 +- profiler/advisor/common/profiling/msprof.py | 3 +- .../advisor/common/profiling/op_summary.py | 4 +- profiler/advisor/common/profiling/tasktime.py | 4 +- profiler/advisor/common/timeline/event.py | 5 +- .../advisor/common/timeline/fusion_ops_db.py | 6 +- profiler/advisor/config/config.ini | 1 + profiler/advisor/config/config.py | 7 + .../config/profiling_data_version_config.yaml | 17 +- .../dataset/ai_core_freq}/__init__.py | 0 .../ai_core_freq/ai_core_freq_dataset.py | 148 ++++++++++ .../dataset/cluster/cluster_dataset.py | 14 +- .../advisor/dataset/profiling/device_info.py | 2 + .../dataset/profiling/profiling_dataset.py | 15 +- .../dataset/profiling/profiling_parser.py | 27 +- .../advisor/dataset/timeline_event_dataset.py | 165 ++++++++++-- profiler/advisor/display/html/render.py | 5 +- .../html/templates/ai_core_frequency.html | 27 ++ .../html/templates/slow_dataloader.html | 18 ++ .../html/templates/sync_batchnorm.html | 30 +++ .../html/templates/synchronize_stream.html | 57 ++++ profiler/advisor/img/overall.png | Bin 64492 -> 49616 bytes profiler/advisor/img/overall_0.png | Bin 0 -> 56377 bytes profiler/advisor/interface/interface.py | 18 +- profiler/advisor/result/item.py | 2 +- profiler/advisor/result/result.py | 18 +- profiler/advisor/rules/dataloader.yaml | 9 + profiler/advisor/rules/sync_batchnorm.yaml | 41 +++ profiler/advisor/rules/synchronize.yaml | 8 + profiler/advisor/utils/utils.py | 64 ++++- profiler/cli/__init__.py | 2 +- profiler/cli/analyze_cli.py | 3 - 
profiler/cli/compare_cli.py | 2 + profiler/cluster_analyse/README.md | 12 +- .../common_func/file_manager.py | 19 ++ profiler/compare_tools/README.md | 84 +++++- .../comparator/api_compare_comparator.py | 32 +++ .../comparator/base_comparator.py | 2 +- .../comparator/kernel_compare_comparator.py | 35 +++ .../comparator/overall_metrics_comparator.py | 50 ++++ .../compare_bean/api_compare_bean.py | 47 ++++ .../compare_bean/kernel_compare_bean.py | 75 ++++++ .../origin_data_bean/kernel_details_bean.py | 39 ++- .../origin_data_bean/trace_event_bean.py | 50 +++- .../compare_bean/overall_metrics_bean.py | 255 ++++++++++++++++++ .../compare_bean/profiling_info.py | 184 ++++++++++++- .../data_prepare/operator_data_prepare.py | 17 ++ .../disaggregate/overall_perf_interface.py | 28 +- .../generator/detail_performance_generator.py | 37 ++- .../profiling_parser/base_profiling_parser.py | 116 +++++++- .../profiling_parser/gpu_profiling_parser.py | 32 ++- .../profiling_parser/npu_profiling_parser.py | 60 ++++- .../compare_backend/utils/args_manager.py | 13 +- .../compare_backend/utils/compare_args.py | 4 + .../compare_backend/utils/constant.py | 15 +- .../compare_backend/utils/excel_config.py | 123 ++++++++- .../compare_backend/utils/torch_op_node.py | 8 + .../compare_backend/utils/tree_builder.py | 3 +- .../view/work_sheet_creator.py | 39 ++- profiler/compare_tools/img/OverallMetrics.png | Bin 0 -> 66941 bytes profiler/compare_tools/performance_compare.py | 2 + .../module_visualization/graph/prof_node.py | 90 ------- .../graph_build/fwd_module_node.py | 29 -- .../graph_build/prof_graph_builder.py | 115 -------- .../module_visualization/prof_graph_export.py | 39 --- .../prof_parse/prof_data_pre_process.py | 102 ------- profiler/test/run_ut.py | 2 + .../test_dataloader_checker.py | 65 +++++ .../timeline_advice/test_syncbn_checker.py | 62 +++++ .../test_synchronize_stream.py | 55 ++++ .../compute_advice/test_frequency_advice.py | 145 ++++++++++ .../test_kernel_details_bean.py | 4 
+- .../test_base_profiling_parser.py | 5 + .../test_gpu_profiling_parser.py | 1 + 107 files changed, 3089 insertions(+), 740 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/att.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml rename profiler/{module_visualization => advisor/analyzer/computation/ai_core_freq}/__init__.py (100%) create mode 100644 profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py create mode 100644 profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py create mode 100644 profiler/advisor/analyzer/dataloader/dataloader_analyzer.py create mode 100644 profiler/advisor/analyzer/dataloader/dataloader_checker.py delete mode 100644 profiler/advisor/analyzer/overall/overall_analyzer.py rename profiler/{module_visualization/graph => advisor/analyzer/schedule/syncbn}/__init__.py (100%) create mode 100644 profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py create mode 100644 profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py rename profiler/{module_visualization/graph_build => advisor/analyzer/schedule/synchronize_stream}/__init__.py (100%) create mode 100644 profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py create mode 100644 profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py create mode 100644 profiler/advisor/analyzer/schedule/timeline_base_checker.py rename profiler/{module_visualization/prof_parse => advisor/dataset/ai_core_freq}/__init__.py (100%) create mode 100644 profiler/advisor/dataset/ai_core_freq/ai_core_freq_dataset.py create mode 100644 profiler/advisor/display/html/templates/ai_core_frequency.html create mode 100644 profiler/advisor/display/html/templates/slow_dataloader.html create mode 100644 
profiler/advisor/display/html/templates/sync_batchnorm.html create mode 100644 profiler/advisor/display/html/templates/synchronize_stream.html create mode 100644 profiler/advisor/img/overall_0.png create mode 100644 profiler/advisor/rules/dataloader.yaml create mode 100644 profiler/advisor/rules/sync_batchnorm.yaml create mode 100644 profiler/advisor/rules/synchronize.yaml create mode 100644 profiler/compare_tools/compare_backend/comparator/api_compare_comparator.py create mode 100644 profiler/compare_tools/compare_backend/comparator/kernel_compare_comparator.py create mode 100644 profiler/compare_tools/compare_backend/comparator/overall_metrics_comparator.py create mode 100644 profiler/compare_tools/compare_backend/compare_bean/api_compare_bean.py create mode 100644 profiler/compare_tools/compare_backend/compare_bean/kernel_compare_bean.py create mode 100644 profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py create mode 100644 profiler/compare_tools/img/OverallMetrics.png delete mode 100644 profiler/module_visualization/graph/prof_node.py delete mode 100644 profiler/module_visualization/graph_build/fwd_module_node.py delete mode 100644 profiler/module_visualization/graph_build/prof_graph_builder.py delete mode 100644 profiler/module_visualization/prof_graph_export.py delete mode 100644 profiler/module_visualization/prof_parse/prof_data_pre_process.py create mode 100644 profiler/test/ut/advisor/advisor_backend/timeline_advice/test_dataloader_checker.py create mode 100644 profiler/test/ut/advisor/advisor_backend/timeline_advice/test_syncbn_checker.py create mode 100644 profiler/test/ut/advisor/advisor_backend/timeline_advice/test_synchronize_stream.py create mode 100644 profiler/test/ut/advisor/compute_advice/test_frequency_advice.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 00000000000..26d33521af1 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff 
--git a/.idea/att.iml b/.idea/att.iml new file mode 100644 index 00000000000..b3f73386741 --- /dev/null +++ b/.idea/att.iml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 00000000000..347c8f87556 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,19 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 00000000000..105ce2da2d6 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000000..dc9ea4906e1 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000000..7e32274517b --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000000..35eb1ddfbbc --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/profiler/README.md b/profiler/README.md index 1669e3524e5..549ffefc14c 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -91,6 +91,7 @@ ascend pytorch profiler数据目录结构如下: | profiler版本 | 发布日期 | 下载链接 | 校验码 | | ------------ | ---------- | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 1.2.0 | 2024-07-25 | [msprof_analyze-1.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.2.0/msprof_analyze-1.2.0-py3-none-any.whl) | 6a4366e3beca40b4a8305080e6e441d6ecafb5c05489e5905ac0265787555f37 | | 1.1.2 | 2024-07-12 | 
[msprof_analyze-1.1.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.2/msprof_analyze-1.1.2-py3-none-any.whl) | af62125b1f9348bf491364e03af712fc6d0282ccee3fb07458bc9bbef82dacc6 | | 1.1.1 | 2024-06-20 | [msprof_analyze-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.1/msprof_analyze-1.1.1-py3-none-any.whl) | 76aad967a3823151421153d368d4d2f8e5cfbcb356033575e0b8ec5acea8e5e4 | | 1.1.0 | 2024-05-28 | [msprof_analyze-1.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.0/msprof_analyze-1.1.0-py3-none-any.whl) | b339f70e7d1e45e81f289332ca64990a744d0e7ce6fdd84a8d82e814fa400698 | diff --git a/profiler/advisor/README.md b/profiler/advisor/README.md index c650f40b3ea..77027110559 100644 --- a/profiler/advisor/README.md +++ b/profiler/advisor/README.md @@ -36,11 +36,11 @@ msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的 3. 查看结果。 - 分析结果输出相关简略建议到执行终端中,并生成`att_advisor_{timestamp}.html`和`att_advisor_{timestamp}.xlsx`文件供用户预览。 + 分析结果输出相关简略建议到执行终端中,并生成`mstt_advisor_{timestamp}.html`和`mstt_advisor_{timestamp}.xlsx`文件供用户预览。 - `att_advisor_{timestamp}.xlsx`文件内容与执行终端输出一致。 + `mstt_advisor_{timestamp}.xlsx`文件内容与执行终端输出一致。 - `att_advisor_{timestamp}.html`文件分析详见“**报告解析**”。 + `mstt_advisor_{timestamp}.html`文件分析详见“**报告解析**”。 执行终端输出示例如下: @@ -72,6 +72,7 @@ msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的 | | block_dim_analysis | block dim算子调优 | | | operator_no_bound_analysis | operator no bound | | | graph | 融合算子图调优 | +| | freq_analysis | AI Core算子降频分析 | | scheduling | timeline_fusion_ops | 亲和API替换调优 | | | timeline_op_dispatch | 识别算子下发问题(路径3/路径5) | @@ -132,6 +133,8 @@ cluster模块的分析包含快慢卡和快慢链路分析,仅识别问题, overall模块的分析包含当前训练任务慢卡的性能拆解,按照计算、通信和下发三个维度进行耗时的统计,可以基于该分析识别到训练性能瓶颈是计算、通信还是下发问题,同样不提供调优建议。 +![输入图片说明](./img/overall_0.png) + ![输入图片说明](./img/overall.png) schedule模块包含亲和API、aclOpCompile、syncBatchNorm、SynchronizeStream等多项检测。 @@ -152,7 +155,7 @@ torch_npu.npu.config.allow_internal_format = 
False ![schedule_3](./img/schedule_3.png) -computation模块从device计算性能维度进行分析,能够识别AI CPU、计算bound、动态Shape等问题并给出相应建议。此处不再详细展开,按照报告进行调优即可。 +computation模块从device计算性能维度进行分析,能够识别AI CPU、计算bound、动态Shape、AI Core算子降频分析等问题并给出相应建议。此处不再详细展开,按照报告进行调优即可。 ![computation_1](./img/computation_1.png) diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py index 5f4bd3202cd..ada1b0bf4f4 100644 --- a/profiler/advisor/analyzer/base_analyzer.py +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging from functools import wraps from typing import Dict, List, Union @@ -59,14 +73,6 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): def optimize(self, **kwargs): pass - @abstractmethod - def make_record(self): - pass - - @abstractmethod - def make_render(self): - pass - def init_dataset_list(self)->None: dataset_cls_list = self.dataset_cls_list if len(dataset_cls_list) == 0: diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py index 846b79a50f3..0b585cbc7c5 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py @@ -19,7 +19,7 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset class SlowLinkAnalyzer(BaseAnalyzer): @@ -35,11 +35,11 @@ class SlowLinkAnalyzer(BaseAnalyzer): SDMA = "SDMA" RDMA = "RDMA" SLOW_LINK_ANALYSIS = "slow_link_analysis" - dataset_cls_list = [ClusterCommunicationDataSet] + dataset_cls_list = [ClusterCommunicationDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) - key = ClusterCommunicationDataSet.get_key() + key = ClusterCommunicationDataset.get_key() self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key) self.rank_bw_dict = self.communication_data_class.get_data() self.result = OptimizeResult() @@ -49,8 +49,9 @@ class SlowLinkAnalyzer(BaseAnalyzer): def optimize(self, **kwargs): if self.rank_bw_dict is None: - print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹, \ - 如不关心这类数据请忽略") + print("Slow 
link analysis failed due to data loading failure. \ + Please check your cluster_analysis_output folder. \ + If you are not concerned about this type of data, please ignore this message.") return self.result self.process() self.format_datas = self.format_details() @@ -65,8 +66,11 @@ class SlowLinkAnalyzer(BaseAnalyzer): def produce_bottleneck(self, link_type: str): data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()] - avg_bw = round(sum(data_list) / len(data_list), 3) - if avg_bw == 0: + if len(data_list) > 0: + avg_bw = round(sum(data_list) / len(data_list), 3) + else: + print("The slow link (identified bottleneck) cannot provide a bottleneck \ + because the analysis data is missing bandwidth information.") return self.bottelneck += f'{link_type}: \n' \ f' The average is {avg_bw}, \n' \ diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py index aa0ddad5078..f439b31f773 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py @@ -19,7 +19,7 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataSet +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataset class SlowRankAnalyzer(BaseAnalyzer): @@ -27,11 +27,11 @@ class SlowRankAnalyzer(BaseAnalyzer): RANK = "rank" RATIO_THRESHOLD = 0.05 BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] - dataset_cls_list = [ClusterStepTraceTimeDataSet] + dataset_cls_list = [ClusterStepTraceTimeDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) - key = 
ClusterStepTraceTimeDataSet.get_key() + key = ClusterStepTraceTimeDataset.get_key() self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key) self.step_trace_dict = self.step_trace_class.get_data() self.result = OptimizeResult() diff --git a/profiler/module_visualization/__init__.py b/profiler/advisor/analyzer/computation/ai_core_freq/__init__.py similarity index 100% rename from profiler/module_visualization/__init__.py rename to profiler/advisor/analyzer/computation/ai_core_freq/__init__.py diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py new file mode 100644 index 00000000000..4f25deff7c0 --- /dev/null +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py @@ -0,0 +1,36 @@ +import logging + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.computation.ai_core_freq.ai_core_freq_checker import AICoreFreqChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.ai_core_freq.ai_core_freq_dataset import AICoreFreqDataset +from profiler.advisor.config.config import Config + +logger = logging.getLogger() + + +class AICoreFreqAnalyzer(BaseAnalyzer): + dataset_cls_list = [AICoreFreqDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: + super().__init__(collection_path, n_processes, **kwargs) + key = AICoreFreqDataset.get_key() + self.dataset = self.get_first_data_by_key(self.dataset_list, key) + self.result = OptimizeResult() + self.html_render = HTMLRender() + self.html = None + + @BaseAnalyzer.check_data((AICoreFreqDataset.get_key(),)) + def optimize(self, **kwargs): + if not Config().get_config("aic_frequency"): + logger.warning("Can not find ai core frequency in info.json*, please check data integrity.") + return self.result 
+ add_render_list = kwargs.get("add_render_list", True) + ai_core_freq_checker = AICoreFreqChecker() + ai_core_freq_checker.check_ai_core_freq(self.dataset) + if not ai_core_freq_checker.ai_core_freq_issues: + return self.result + ai_core_freq_checker.make_record(self.result) + self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list) + return self.result diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py new file mode 100644 index 00000000000..5ea4dbd7542 --- /dev/null +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py @@ -0,0 +1,100 @@ +import logging + +from profiler.advisor.dataset.ai_core_freq.ai_core_freq_dataset import AICoreFreqDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.config.config import Config +from profiler.advisor.utils.utils import convert_to_float + +logger = logging.getLogger() + + +class AICoreFreqChecker: + DEFAULT_FREQ = 1800 + DECREASE_FREQ_RATIO = 0.05 + SHOW_TOPK_OPS = 10 + TOTAL_DURATION_INDEX = 2 + DECREASE_FREQ_RATIO_INDEX = 3 + + def __init__(self): + + self.ai_core_freq_issues = False + self.desc = "" + self.suggestions = "" + self.decrease_freq_ops = [] + self.headers = [] + self.op_freq = None + self.rank_id = None + self.stage = None + + def check_ai_core_freq(self, event_dataset: AICoreFreqDataset, rank_id=None, stage=None): + """ + :Param event_dataset: dataset of timeline event + """ + if not hasattr(event_dataset, "op_freq") or not getattr(event_dataset, "op_freq"): + logger.debug("Skip slow ai core frequency checker, " + "because no ai core frequency were recorded in trace_view.json") + return + + self.rank_id = rank_id + self.stage = stage + self.op_freq = event_dataset.op_freq + for op_name, op_info in self.op_freq.items(): + freq_list = 
op_info.get("freq_list", []) + if not freq_list: + continue + + op_count = op_info.get("count", 0) + op_total_duration = round(op_info.get("dur", 0), 2) + max_freq = max(self.DEFAULT_FREQ, convert_to_float(Config().get_config("aic_frequency"))) + + decrease_freq_ratio = sum(max_freq - freq for freq in freq_list) / (max_freq * len(freq_list)) + if decrease_freq_ratio >= self.DECREASE_FREQ_RATIO: + self.ai_core_freq_issues = True + self.decrease_freq_ops.append([op_name, op_count, op_total_duration, + f"{round(decrease_freq_ratio, 4):.2%}", + round(sum(freq_list) / len(freq_list), 2), + max(freq_list), min(freq_list)]) + + if self.decrease_freq_ops: + # 按算子总耗时和降频比率 降序排列 + self.decrease_freq_ops.sort(key= + lambda x: (x[self.TOTAL_DURATION_INDEX], x[self.DECREASE_FREQ_RATIO_INDEX]), + reverse=True) + + self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " + f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") + if self.rank_id: + self.desc = f"For rank {self.rank_id}, " + self.desc.lower() + self.suggestions = "Please check the temperature or max power of your machine." 
+ + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + optimization_item = OptimizeItem("AI Core Frequency", self.desc, [self.suggestions]) + result.add(OptimizeRecord(optimization_item)) + + self.headers = ["Operator name", "Count", "Total duration(us)", "AI CORE frequency decreased ratio", + "Average frequency", "Max frequency", "Min frequency"] + if self.rank_id: + self.headers = ["Rank id"] + self.headers + sub_table_name = "AI Core Frequency" if not self.stage else f"Stage-{self.stage}: AI Core Frequency" + result.add_detail(sub_table_name, headers=self.headers) + + for row in self.decrease_freq_ops: + if self.rank_id: + row = [self.rank_id] + row + result.add_detail(sub_table_name, detail=row) + + def make_render(self, html_render, add_render_list=True): + if self.SHOW_TOPK_OPS: + self.desc += f" Only show {self.SHOW_TOPK_OPS} operators here, see latest mstt_advisor.xlsx for details." + return html_render.render_template(key="computation", + template_dir="templates", + template_name="ai_core_frequency.html", + desc=self.desc, + suggestion=self.suggestions, + headers=self.headers, + data=self.decrease_freq_ops[:self.SHOW_TOPK_OPS], + add_render_list=add_render_list) diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py index 4eca1c6c027..0caede4b894 100644 --- a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -3,13 +3,13 @@ import os from functools import partial from typing import List, Dict, Optional -import yaml from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker, logger from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker import OpStackFinder from profiler.advisor.common import constant from profiler.advisor.dataset.dataset import Dataset from 
profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.cluster_analyse.common_func.file_manager import FileManager class AicpuChecker(OperatorChecker): @@ -47,8 +47,8 @@ class AicpuChecker(OperatorChecker): if not os.path.exists(rule_path): logger.warning("Skip analyze aicpu issues, because %s does not exist.", rule_path) return {} - with open(rule_path, 'r') as f: - self.aicpu_rules = yaml.safe_load(f) + + self.aicpu_rules = FileManager.read_yaml_file(rule_path) self.filter_aicpu_rules(self.aicpu_rules) for checker_name, check_rule in self.aicpu_rules.items(): if not isinstance(check_rule, (list, dict,)): diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index a7d7ddd93c7..7a873c65635 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -1,5 +1,4 @@ import logging - from typing import List from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py index 86826177007..2021bcd5765 100644 --- a/profiler/advisor/analyzer/computation/profiling_analyzer.py +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -1,19 +1,15 @@ import logging from abc import ABC -from typing import Dict, List from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.computation.aicpu.aicpu_checker import AicpuChecker from profiler.advisor.analyzer.computation.bound.block_dim_checker import BlockDimChecker from profiler.advisor.analyzer.computation.bound.operator_bound_checker 
import OperatorBoundChecker -from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker from profiler.advisor.analyzer.computation.op_compile.dynamic_shape_checker import DynamicShapeChecker from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset -from profiler.advisor.utils.utils import get_supported_subclass logger = logging.getLogger() @@ -76,14 +72,15 @@ class BlockDimAnalyzer(ProfilingAnalyzer): def __init__(self, collection_path, **kwargs) -> None: super().__init__(collection_path, **kwargs) self.checker = BlockDimChecker(self.cann_version) - + class OperatorBoundAnalyzer(ProfilingAnalyzer): def __init__(self, collection_path, **kwargs) -> None: super().__init__(collection_path, **kwargs) self.checker = OperatorBoundChecker(self.cann_version) + class AicpuAnalyzer(ProfilingAnalyzer): def __init__(self, collection_path, **kwargs) -> None: super().__init__(collection_path, **kwargs) - self.checker = AicpuChecker(self.cann_version) \ No newline at end of file + self.checker = AicpuChecker(self.cann_version) diff --git a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py new file mode 100644 index 00000000000..291c3a1f941 --- /dev/null +++ b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py @@ -0,0 +1,30 @@ +import logging + +from typing import List, Dict, Any + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.dataloader.dataloader_checker import DataloaderChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + +logger = logging.getLogger() + + +class DataloaderAnalyzer(BaseAnalyzer): + 
dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: + super().__init__(collection_path, n_processes, **kwargs) + key = TimelineEventDataset.get_key() + self.dataset = self.get_first_data_by_key(self.dataset_list, key) + self.result = OptimizeResult() + self.html_render = HTMLRender() + + @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + def optimize(self, **kwargs): + dataloader_checker = DataloaderChecker() + dataloader_checker.check_slow_dataloader(self.dataset) + dataloader_checker.make_record(self.result) + dataloader_checker.make_render(self.html_render) + return self.result diff --git a/profiler/advisor/analyzer/dataloader/dataloader_checker.py b/profiler/advisor/analyzer/dataloader/dataloader_checker.py new file mode 100644 index 00000000000..eb1886284ef --- /dev/null +++ b/profiler/advisor/analyzer/dataloader/dataloader_checker.py @@ -0,0 +1,84 @@ +import os +import re +import logging +import yaml + +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.cluster_analyse.common_func.file_manager import FileManager + +logger = logging.getLogger() + + +class DataloaderChecker: + + def __init__(self): + + self.dataloader_issues = False + self.optimization_item = [] + self.desc = "" + self.suggestions = [] + self.dataloader_duration_threshold = None + self._init_rule() + + def check_slow_dataloader(self, event_dataset: TimelineEventDataset): + """ + :Param event_dataset: dataset of timeline event + """ + if not hasattr(event_dataset, "dataloader") or not getattr(event_dataset, "dataloader"): + logger.debug("Skip slow dataloader checker, because no dataloader duration larger than %s", + self.dataloader_duration_threshold) + return + for event in event_dataset.dataloader: + + dataloader_duration = float(event.dur) 
/ 1000 + if dataloader_duration < self.dataloader_duration_threshold: + continue + self.desc = self.desc.format(dataloader_duration=dataloader_duration, + dataloader_duration_threshold=self.dataloader_duration_threshold) + self.dataloader_issues = True + + if re.search("singleprocess", event.name.lower()): + self.suggestions = self._reset_suggestions(["I/O", "num_workers"]) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.dataloader_issues: + return + + self.optimization_item.append(OptimizeItem("Slow dataloader", self.desc, self.suggestions)) + for optimization in self.optimization_item: + result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render): + if not self.dataloader_issues: + return + html_render.render_template(key="dataloader", + template_dir="templates", + template_name="slow_dataloader.html", + desc=self.desc, + suggestions=self.suggestions) + + def _init_rule(self): + dataloader_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + "dataloader.yaml" + ) + dataloader_rule = FileManager.read_yaml_file(dataloader_rule_path) + + self.dataloader_duration_threshold = dataloader_rule.get("dataloader_duration_threshold") + self.desc = dataloader_rule.get("problem") + self.suggestions = dataloader_rule.get("solutions") + + def _reset_suggestions(self, suggestion_pattern_list): + + suggestions = [] + for solution in self.suggestions: + for suggestion_pattern in suggestion_pattern_list: + if re.search(suggestion_pattern, solution): + suggestions.append(solution) + return suggestions diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py index e64020fdfe2..30bd4323795 100644 --- a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py @@ -149,7 
+149,7 @@ class GraphFusionRules: optimization_item = OptimizeItem( "fusion issue", f"Found {len(self.candidates)} fusion issues", - ["Check fusion issues detail in att_advisor*.html"] + ["Check fusion issues detail in mstt_advisor*.html"] ) total_time = 0.0 for candidate in self.task_duration_list: diff --git a/profiler/advisor/analyzer/overall/overall_analyzer.py b/profiler/advisor/analyzer/overall/overall_analyzer.py deleted file mode 100644 index 916a396b3d0..00000000000 --- a/profiler/advisor/analyzer/overall/overall_analyzer.py +++ /dev/null @@ -1,45 +0,0 @@ -import logging -from typing import Dict, List - -from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.result.result import OptimizeResult -from profiler.compare_tools.compare_backend.utils.constant import Constant -from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface - -logger = logging.getLogger() - - -class OverallSummaryAnalyzer(BaseAnalyzer): - - def __init__(self, profiling_path, benchmark_profiling_path=None, **kwargs): - self.benchmark_profiling_path = benchmark_profiling_path or profiling_path - self.profiling_path = profiling_path - self.html_render = HTMLRender() - self.result = OptimizeResult() - - def optimize(self, **kwargs): - compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare( - Constant.OVERALL_COMPARE) - - headers = compare_result.get('Model Profiling Time Distribution').get("headers", []) - rows = compare_result.get('Model Profiling Time Distribution').get("rows", []) - - self.make_record() - self.make_render(headers=headers, rows=rows) - return compare_result - - def make_record(self): - pass - - def make_render(self, **kwargs): - headers = kwargs.get("headers") - rows = kwargs.get("rows") - - if not headers or not rows: - logger.info("Empty headers or rows, skip render overall analysis html") - 
self.html_render.render_template(key="overall", - template_dir="templates", - template_name="overall_analysis.html", - headers=kwargs.get("headers"), - rows=kwargs.get("rows")) diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index c74ae051033..8e93dbda77d 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -13,27 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -import copy - -import logging -from typing import Dict, List +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult from profiler.compare_tools.compare_backend.utils.constant import Constant -from profiler.advisor.common import constant as const from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface -from profiler.advisor.utils.utils import get_file_path_from_directory, load_parameter class OverallSummaryAnalyzer(BaseAnalyzer): OVERALL_SUMMARY_ANALYZER = "overall_summary_analysis" advice_map = { - "Computing Time": "if you want more detailed advice please go to att_advisor_*.html", - "Uncovered Communication Time": "if you want more detailed advice please go to att_advisor_*.html", - "Free Time": "if you want more detailed advice please go to att_advisor_*.html" + "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Free Time": "if you want more detailed 
advice please go to mstt_advisor_*.html" } time_name_map = { "Computing Time": "computing", @@ -47,45 +41,37 @@ class OverallSummaryAnalyzer(BaseAnalyzer): 'SDMA Time(Num)': 'SDMA Time' } performance_time_dict = { - "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', - 'Flash Attention Time(Backward)(Num)', 'Other Time'], - "Uncovered Communication Time(Wait Time)": [], - "Free Time": ['SDMA Time(Num)'] + "Computing Time": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " -- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": "vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- Other Cube": "other_cube_time_ms", + "Uncovered Communication Time": "uncovered_communication_time_ms", + " -- Wait": "wait_time_ms", + " -- Transmit": "transmit_time_ms", + "Free Time": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- Free": "free_ms", + "E2E Time": "e2e_time_ms" } def __init__(self, collection_path: str, n_processes: int = 1, **kwargs): profile_path = get_profile_path(collection_path) super().__init__(profile_path, n_processes, **kwargs) - self.base_collection_path = kwargs.get("base_collection_path", "") - self._has_base_collection = False + self.benchmark_profiling_path = kwargs.get("benchmark_profiling_path", "") + self._has_benchmark_profiling = False self._is_minimal_profiling = False self.cur_data = {} - self.cur_data_table = {} self.cur_bottleneck = {} + self._disaggregate_perf = {} + self._disaggregate_benchmark_perf = {} self.cur_advices = "" - self._headers = [] - self._base_data = [] - self._comparison_data = [] self.html_render = HTMLRender() self.result = OptimizeResult() self.bottleneck_str = "" - self.bottleneck_table = {} - - @staticmethod - def split_duration_and_num(time_value: str) -> tuple: - split_data = time_value.split("s") # time value example: 0.229s(1756) - duration, num = 0.0, None - if len(split_data) >= 2: - try: - num = 
int(split_data[1].strip("()")) - except ValueError: - pass - if len(split_data) >= 1: - try: - duration = float(split_data[0]) - except ValueError: - print(f"[WARNING] Invalid time value: {time_value}.") - return duration, num + self.over_summary_analysis = {} @staticmethod def calculate_ratio(dividend, divisor): @@ -93,131 +79,121 @@ class OverallSummaryAnalyzer(BaseAnalyzer): return float("inf") return dividend / divisor + @staticmethod + def get_time_category_dict(overall_dict: dict): + time_category_dict = { + "Computing Time": round(overall_dict.get('computing_time_ms', 0.0), 3), + "Uncovered Communication Time": round(overall_dict.get('uncovered_communication_time_ms', 0.0), 3), + "Free Time": round(overall_dict.get('free_time_ms', 0.0), 3) + } + return time_category_dict + def path_check(self): - if self.base_collection_path: - if os.path.exists(self.base_collection_path): - self._has_base_collection = True + if self.benchmark_profiling_path: + if os.path.exists(self.benchmark_profiling_path): + self._has_benchmark_profiling = True else: - print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.") + print(f"[WARNING] Invalid path which not exists: {self.benchmark_profiling_path}.") return os.path.exists(self.collection_path) def process(self): - base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path - result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE) - for data in result_data.values(): - self._headers = data.get("headers", []) - rows = data.get("rows", []) - if len(rows) == 2: - self._base_data = rows[0] - self._comparison_data = rows[1] - if not self._headers or not self._comparison_data: + self._disaggregate_perf = ComparisonInterface(self.collection_path).disaggregate_perf(Constant.OVERALL_COMPARE) + if not self._disaggregate_perf: return - self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers - 
if self._has_base_collection: - self.cur_data["comparison_result"] = result_data - time_category_dict = {} - for time_category, time_list in self.performance_time_dict.items(): - time_value = self.get_time_value(time_category, self._comparison_data) - if time_value == Constant.INVALID_VALUE: - continue - duration, _ = self.split_duration_and_num(time_value) - time_category = time_category.split("(")[0] - time_category_dict[time_category] = duration - self.get_sub_category_time(time_category, time_list, duration) - self.cur_data["overall_data"] = time_category_dict - - def get_time_value(self, header_name: str, data_list: list): - try: - data_index = self._headers.index(header_name) - except ValueError: - return Constant.INVALID_VALUE - try: - time_value = data_list[data_index] - except IndexError: - return Constant.INVALID_VALUE - return time_value - - def get_sub_category_time(self, category: str, time_list: list, total_duration: float): - sub_time_dict = {} - for time_name in time_list: - time_value = self.get_time_value(time_name, self._comparison_data) - if time_value == Constant.INVALID_VALUE: - continue - sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, "")) - duration, num = self.split_duration_and_num(time_value) - sub_time_dict.setdefault(f"Duration(s)", []).append(duration) - sub_time_dict.setdefault(f"Duration Ratio", []).append( - "{:.2%}".format(self.calculate_ratio(duration, total_duration))) - sub_time_dict.setdefault(f"Kernel Number", []).append(num) - self.cur_data[self.time_name_map.get(category)] = sub_time_dict + self._is_minimal_profiling = self._disaggregate_perf.get("minimal_profiling", False) + self.cur_data["overall_data"] = self.get_time_category_dict(self._disaggregate_perf.get('overall', {})) + if self._has_benchmark_profiling: + self._disaggregate_benchmark_perf = ComparisonInterface( + self.benchmark_profiling_path).disaggregate_perf(Constant.OVERALL_COMPARE) def identify_bottleneck(self): 
overall_data = self.cur_data.get("overall_data") if not overall_data: return e2e_time = '%.3f' % sum([data for data in overall_data.values()]) - overall_bottleneck = f"The Model E2E Time is {e2e_time}s.\n" + overall_bottleneck = f"The Model E2E Time is {e2e_time}ms.\n" comparison_bottleneck = "" for time_type, time_value in overall_data.items(): - # add subtype time bottleneck - self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n" # add overall bottleneck - overall_bottleneck += f" -- {time_type} is {time_value}s\n" + overall_bottleneck += f" -- {time_type} is {time_value}ms\n" if time_type == "Free Time" and self._is_minimal_profiling and self.calculate_ratio(time_value, e2e_time) > 0.1: overall_bottleneck += "percentage of free time exceed the threshold 10%." - if not self._has_base_collection: + if not self._has_benchmark_profiling: continue # add comparison bottleneck - time_type_origin = "Uncovered Communication Time(Wait Time)" \ - if time_type == "Uncovered Communication Time" else time_type - base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data)) + base_duration = self.get_time_category_dict( + self._disaggregate_benchmark_perf.get('overall', {}) + ).get(time_type) if time_value > base_duration: ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration)) comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n" self.cur_bottleneck["overall_data"] = overall_bottleneck if comparison_bottleneck: self.cur_bottleneck["comparison_result"] = comparison_bottleneck + def optimize(self, **kwargs): if self.path_check(): self.process() self.identify_bottleneck() self.format_bottleneck() - self.format_cur_data() + self.format_over_summary_analysis() self.make_record() self.make_render() return self.result def format_bottleneck(self): result = '' - headers = [] - data_list = [] - data = [] - for key, value in self.cur_bottleneck.items(): + 
for _, value in self.cur_bottleneck.items(): if not value: continue - result += f'{key}: {value} \n' - headers.append(key) - data.append(value) - data_list.append(data) + result += f'{value} \n' self.bottleneck_str = result - self.bottleneck_table["headers"] = headers - self.bottleneck_table["data"] = data_list - def format_cur_data(self): - if not self.cur_data: - return - for data_type, data in self.cur_data.items(): - if not data: - continue - if data_type not in list(self.time_name_map.values()): - data_list = list(data.values()) - else: - data_list = [','.join(map(str, value)) for value in data.values()] - headers = list(data.keys()) - data_table = {"headers": headers, "data": [data_list]} - self.cur_data_table[data_type] = copy.deepcopy(data_table) + def format_over_summary_analysis(self): + headers = ['Performance Index', 'Duration(ms)', 'Duration Ratio'] + performance_data = self.get_analysis_data(self._disaggregate_perf) + benchmark_data = self.get_analysis_data(self._disaggregate_benchmark_perf) + if self._has_benchmark_profiling: + headers.append('Diff Duration(ms)') + self.format_analysis_with_benchmark(performance_data, benchmark_data, headers) + else: + self.format_analysis_only(performance_data, headers) + + def get_analysis_data(self, data_dict: dict): + if not data_dict: + return {} + return { + **data_dict.get("overall"), + **data_dict.get("computing_time_disaggregate"), + **data_dict.get("communication_time_disaggregate"), + **data_dict.get("free_time_disaggregate"), + } + def format_analysis_only(self, performance_data: dict, headers: list): + res = [] + total_duration = performance_data.get('e2e_time_ms', 0.0) + for time_name, time_key in self.performance_time_dict.items(): + row = [time_name] + duration = performance_data.get(time_key, 0.0) + row.append("{:.3f}".format(duration)) + row.append("{:.2%}".format(self.calculate_ratio(duration, total_duration))) + res.append(row) + self.over_summary_analysis["headers"] = headers + 
self.over_summary_analysis["data"] = res + + def format_analysis_with_benchmark(self, performance_data: dict, benchmark_data: dict, headers: list): + res = [] + total_duration = performance_data.get('e2e_time_ms', 0.0) + for time_name, time_key in self.performance_time_dict.items(): + row = [time_name] + duration = performance_data.get(time_key, 0.0) + row.append("{:.3f}".format(duration)) + row.append("{:.2%}".format(self.calculate_ratio(duration, total_duration))) + row.append("{:.3f}".format(duration - benchmark_data.get(time_key, 0.0))) + res.append(row) + self.over_summary_analysis["headers"] = headers + self.over_summary_analysis["data"] = res def make_record(self): """ @@ -232,20 +208,23 @@ class OverallSummaryAnalyzer(BaseAnalyzer): ) self.result.add(OptimizeRecord(optimization_item)) - self.result.add_detail(const.BOTTLENECK, self.bottleneck_table["headers"], self.bottleneck_table["data"][0]) - for data_type, data_dict in self.cur_data_table.items(): - if data_dict: - self.result.add_detail(const.DATA + data_type, data_dict["headers"], data_dict["data"][0]) + self.result.add_detail( + OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, + headers=self.over_summary_analysis["headers"] + ) + for data in self.over_summary_analysis["data"]: + self.result.add_detail(OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, detail=data) def make_render(self): if not self.bottleneck_str and not self.cur_advices: return + # 将\n替换为html换行 + bottleneck_str = self.bottleneck_str.replace('\n', '
') result_for_html = { - "Description" : self.bottleneck_str, - "suggestion" : self.cur_advices, - "details" : [self.bottleneck_table] + "Description": bottleneck_str, + "suggestion": self.cur_advices, + "details": [self.over_summary_analysis] } - self.html_render.render_template(key="overall", title=OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, template_dir="templates", @@ -254,9 +233,10 @@ class OverallSummaryAnalyzer(BaseAnalyzer): torch_version=self.torch_version, result=result_for_html) + def get_profile_path(collection_path): for root, dirs, files in os.walk(collection_path): for file in files: if file.startswith("profiler_info"): return root - return "" \ No newline at end of file + return "" diff --git a/profiler/module_visualization/graph/__init__.py b/profiler/advisor/analyzer/schedule/syncbn/__init__.py similarity index 100% rename from profiler/module_visualization/graph/__init__.py rename to profiler/advisor/analyzer/schedule/syncbn/__init__.py diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py new file mode 100644 index 00000000000..fc6dfce5f0b --- /dev/null +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py @@ -0,0 +1,30 @@ +import logging + +from typing import List, Dict, Any + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.schedule.syncbn.syncbn_checker import SyncBNChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + +logger = logging.getLogger() + + +class SyncBNAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, **kwargs): + super().__init__(collection_path, **kwargs) + self.result = OptimizeResult() + self.html_render = HTMLRender() + key = TimelineEventDataset.get_key() + 
self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + + @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + def optimize(self, **kwargs): + syncbn_checker = SyncBNChecker() + syncbn_checker.check_syncbn(self.timeline_event_dataset) + syncbn_checker.make_record(self.result) + syncbn_checker.make_render(self.html_render) + return self.result diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py new file mode 100644 index 00000000000..83988c4e60b --- /dev/null +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py @@ -0,0 +1,70 @@ +import logging +import os + +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.cluster_analyse.common_func.file_manager import FileManager + +logger = logging.getLogger() + + +class SyncBNChecker: + + def __init__(self): + self.optimization_item = [] + self.syncbn_issues = False + self.desc = "" + self.suggestions = [] + self.solutions = None + self.max_syncbn_num = None + self._init_rule() + + def check_syncbn(self, event_dataset: TimelineEventDataset): + """ + :Param event_dataset: dataset of timeline event + """ + if not hasattr(event_dataset, "sync_batchnorm") or not getattr(event_dataset, "sync_batchnorm"): + logger.debug("Skip syncbn checker, because no syncbn found") + return + + syncbn_num = len(event_dataset.sync_batchnorm) + self.syncbn_issues = syncbn_num >= self.max_syncbn_num + self.desc = self.desc.format(syncbn_num=syncbn_num) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.syncbn_issues: + return + + self.optimization_item.append(OptimizeItem("SyncBatchNorm", self.desc, self.suggestions)) + for optimization in self.optimization_item: + 
result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render): + if not self.syncbn_issues: + return + html_render.render_template(key="schedule", + template_dir="templates", + template_name="sync_batchnorm.html", + desc=self.desc, + solutions=self.solutions) + + def _init_rule(self): + syncbn_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), + "rules", + "sync_batchnorm.yaml" + ) + + syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) + + self.max_syncbn_num = syncbn_rule.get("max_syncbn_num") + self.desc = syncbn_rule.get("problem") + + self.solutions = syncbn_rule.get("solutions") + for solution in self.solutions: + for key, val in solution.items(): + self.suggestions.append(f"{key}, {val.get('desc')}") diff --git a/profiler/module_visualization/graph_build/__init__.py b/profiler/advisor/analyzer/schedule/synchronize_stream/__init__.py similarity index 100% rename from profiler/module_visualization/graph_build/__init__.py rename to profiler/advisor/analyzer/schedule/synchronize_stream/__init__.py diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py new file mode 100644 index 00000000000..88e55449c55 --- /dev/null +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py @@ -0,0 +1,32 @@ +import logging + +from typing import List, Dict, Any + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_checker import SynchronizeStreamChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + +logger = logging.getLogger() + + +class SynchronizeStreamAnalyzer(BaseAnalyzer): 
+ dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, **kwargs): + super().__init__(collection_path, **kwargs) + self.result = OptimizeResult() + self.html_render = HTMLRender() + + key = TimelineEventDataset.get_key() + self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + + @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + def optimize(self, **kwargs): + + synchronize_stream_checker = SynchronizeStreamChecker() + synchronize_stream_checker.check_synchronize(self.timeline_event_dataset, kwargs.get("profiling_with_stack")) + synchronize_stream_checker.make_record(self.result) + synchronize_stream_checker.make_render(self.html_render) + return self.result diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py new file mode 100644 index 00000000000..03d88d281ca --- /dev/null +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py @@ -0,0 +1,89 @@ +import logging + +from profiler.advisor.common import constant as const +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker +from profiler.advisor.utils.utils import format_timeline_result + +logger = logging.getLogger() + + +class SynchronizeStreamChecker(TimelineBaseChecker): + + def __init__(self): + super().__init__(n_processes=1) + self.optimization_item = [] + self.synchronize_issues = False + self.desc = "" + self.suggestions = [] + self.solutions = [] + self.max_synchronize_num = None + + def check_synchronize(self, event_dataset: TimelineEventDataset, profiling_with_stack=None): + """ + :Param event_dataset: dataset of timeline event + """ + 
if not hasattr(event_dataset, "synchronize_stream") or not getattr(event_dataset, "synchronize_stream"): + logger.debug("Skip synchronize stream checker, because no synchronize stream found") + return + + synchronize_num = event_dataset.synchronize_stream.total_count + slow_synchronize_stream = event_dataset.synchronize_stream.slow_synchronize_stream + total_slow_synchronize_time = sum((float(sync_stream.dur) for sync_stream in slow_synchronize_stream)) + + synchronize_stream_rule = event_dataset.synchronize_stream.rule + self.max_synchronize_num = synchronize_stream_rule.get("max_synchronize_num") + self.synchronize_issues = synchronize_num >= self.max_synchronize_num and len(slow_synchronize_stream) > 0 + if not self.synchronize_issues: + return + + for sync_stream in slow_synchronize_stream: + if sync_stream.name not in self._matched_op_index: + self._matched_op_index[sync_stream.name] = [] + self._matched_op_index[sync_stream.name].append(sync_stream.dataset_index) + self.query_stack(event_dataset, profiling_with_stack) + + self.desc = synchronize_stream_rule.get("problem") + self.desc = self.desc.format(synchronize_num=synchronize_num, + slow_synchronize_num=len(slow_synchronize_stream), + total_synchronize_stream_time=total_slow_synchronize_time) + + solutions = synchronize_stream_rule.get("solutions") + for solution in solutions: + renderer_solution = {} + for key, val in solution.items(): + if self.empty_stacks and self.framework_black_list: + # 如果堆栈源于torch, torch_npu等框架,则不提示修改的代码 + if "modify code" in key.lower(): + continue + self.suggestions.append(f"{key}, {val.get('desc')}") + renderer_solution.update({key: val}) + self.solutions.append(renderer_solution) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.synchronize_issues: + return + + self.optimization_item.append(OptimizeItem("SynchronizeStream", self.desc, self.suggestions)) + for optimization in self.optimization_item: + 
result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render): + if not self.synchronize_issues: + return + + format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) + html_render.render_template(key="schedule", + template_dir="templates", + template_name="synchronize_stream.html", + desc=self.desc, + solutions=self.solutions, + result=format_result_for_html, + with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL, + empty_stacks=self.empty_stacks, + framework_black_list=self.framework_black_list) diff --git a/profiler/advisor/analyzer/schedule/timeline_base_checker.py b/profiler/advisor/analyzer/schedule/timeline_base_checker.py new file mode 100644 index 00000000000..8bc69150263 --- /dev/null +++ b/profiler/advisor/analyzer/schedule/timeline_base_checker.py @@ -0,0 +1,91 @@ +from abc import ABC, abstractmethod +import multiprocessing +import logging + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult + +logger = logging.getLogger() + + +class TimelineBaseChecker(ABC): + + def __init__(self, n_processes: int = 1): + self.n_processes = n_processes + self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict() + self.matched_op_stacks = {} + self.empty_stacks = True + self.framework_black_list = False + + @abstractmethod + def make_record(self, result: OptimizeResult): + pass + + @abstractmethod + def make_render(self, html_render): + pass + + def query_stack(self, event_dataset: TimelineEventDataset = None, profiling_with_stack: str = None): + if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): + return + + event_dataset = event_dataset if not profiling_with_stack else TimelineEventDataset( + collection_path=profiling_with_stack, data={}, 
_datasets={}, analysis_mode="fusion_ops", + build_dataset=False) + + op_stack_list = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) + for op_stack in op_stack_list: + for op, stack in op_stack.items(): + if op not in self.matched_op_stacks: + self.matched_op_stacks[op] = {} + if stack == const.TIMELINE_FUSION_OPS_NO_STACK_FLAG: + continue + if stack not in self.matched_op_stacks[op]: + self.matched_op_stacks[op][stack] = 0 + self.matched_op_stacks[op][stack] += 1 + + def _query_stack_by_matched_index(self, index, event): + stack_record = {} + event = TimelineEvent(event) + + matched_ops = [] + for op, matched_index in self._matched_op_index.items(): + if index not in matched_index: + continue + + matched_ops.append(op) + stack = event.args.get(const.CALL_STACKS) + + if not stack: + logger.debug("Got empty '%s' for event %s", const.CALL_STACKS, event) + continue + + if not self._is_keep_stack(stack): + self.framework_black_list = True + logger.debug("Drop stack from framework %s", const.FRAMEWORK_STACK_BLACK_LIST) + continue + + if self.empty_stacks and stack: + self.empty_stacks = False + + stack_record[op] = stack + + if matched_ops and not stack_record: + for op in matched_ops: + stack_record[op] = const.TIMELINE_FUSION_OPS_NO_STACK_FLAG + + return stack_record + + def _is_keep_stack(self, stack): + # 过滤掉torch, torch_npu, megatron, deepspeed等框架下的堆栈,这些源码基本是不能被修改的 + stack_list = stack.replace("\\r\\n", ";").split(";") + if not stack_list: + return False + + final_called_stack = stack_list[0] + for framework in const.FRAMEWORK_STACK_BLACK_LIST: + if framework in final_called_stack.split("/"): + return False + return True diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py index 592f9d421e2..52e3e07554f 100644 --- a/profiler/advisor/common/analyzer_scopes.py +++ b/profiler/advisor/common/analyzer_scopes.py @@ -12,3 +12,7 @@ class SupportedScopes: BLOCK_DIM_ANALYSIS = "block_dim_analysis" 
OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis" TIMELINE_OP_DISPATCH = "timeline_op_dispatch" + DATALOADER = "dataloader" + SYNCBN = "syncbn" + SYNCHRONIZE_STREAM = "synchronize_stream" + FREQ_ANALYSIS = "freq_analysis" diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py index 697430ee6ca..87245a43ea3 100644 --- a/profiler/advisor/common/constant.py +++ b/profiler/advisor/common/constant.py @@ -26,6 +26,7 @@ ENQUEUE = "enqueue" TORCH_TO_NPU = "torch_to_npu" OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute" OP_COMPILE_ID = "aclopCompileAndExecute" +SYNC_STREAM = "AscendCL@aclrtSynchronizeStream" MAX_OP_COMPILE_NUM = 20 ACL_TO_NPU = "acl_to_npu" TASK_TYPE = "Task Type" @@ -111,7 +112,7 @@ HTTP_PREFIXES = "http://" HTTPS_PREFIXES = "https://" COMMON_YAML_DIR = "modelarts/solution/ma_advisor_rules/" COMMON_ENDPOINT_SUFFIX = "obs.{}.myhuaweicloud.com" -INNER_ENDPOINT_SUFFIX= "obs.{}.ulanqab.huawei.com" +INNER_ENDPOINT_SUFFIX = "obs.{}.ulanqab.huawei.com" AICPU_RULES_YAML_NAME = "aicpu_rules.yaml" FUSION_PASS_YAML_NAME = "op_fusion_pass.yaml" @@ -138,4 +139,8 @@ CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv" CLUSTER_COMM_JSON = "cluster_communication.json" BOTTLENECK = "bottleneck" -DATA = "data" \ No newline at end of file +DATA = "data" + +FRAMEWORK_STACK_BLACK_LIST = ["torch", "torch_npu", "megatron", "deepspeed"] +DISABLE_STREAMING_READER = "DISABLE_STREAMING_READER" +MAX_FILE_SIZE = 10**10 diff --git a/profiler/advisor/common/graph/graph_parser.py b/profiler/advisor/common/graph/graph_parser.py index d4c67fc1918..ef4dc4d681e 100644 --- a/profiler/advisor/common/graph/graph_parser.py +++ b/profiler/advisor/common/graph/graph_parser.py @@ -1,11 +1,12 @@ import os import logging -import yaml import itertools from collections import deque from dataclasses import dataclass from typing import List, Tuple, Dict +from profiler.cluster_analyse.common_func.file_manager import FileManager + logger = logging.getLogger() @@ 
-344,9 +345,9 @@ class QueryGraphParser: if not os.path.exists(rule_database): raise FileNotFoundError(f"Path {rule_database} does not exist.") - with open(rule_database, 'r') as f: - database = yaml.safe_load(f) - self.parse_yaml(database) + + database = FileManager.read_yaml_file(rule_database) + self.parse_yaml(database) def parse_yaml(self, yaml_database): fusion_strategy_list = yaml_database.get("GraphFusion", []) diff --git a/profiler/advisor/common/profiling/ge_info.py b/profiler/advisor/common/profiling/ge_info.py index 9996ec611a2..4fd5846d88d 100644 --- a/profiler/advisor/common/profiling/ge_info.py +++ b/profiler/advisor/common/profiling/ge_info.py @@ -17,12 +17,13 @@ class GeInfo(ProfilingParser): """ ge info file """ - FILE_PATTERN = r"ge_info.db" FILE_PATTERN_MSG = "ge_info.db" FILE_INFO = "ge info" STATIC_OP_STATE = "0" DYNAMIC_OP_STATE = "1" + file_pattern_list = [r"ge_info.db"] + def __init__(self, path: str) -> None: super().__init__(path) self.op_state_info_list = None diff --git a/profiler/advisor/common/profiling/msprof.py b/profiler/advisor/common/profiling/msprof.py index 9453986b822..750c5481e67 100644 --- a/profiler/advisor/common/profiling/msprof.py +++ b/profiler/advisor/common/profiling/msprof.py @@ -33,10 +33,11 @@ class Msprof(ProfilingParser): msprof """ - FILE_PATTERN = r"^msprof[_\d]+.json$" FILE_PATTERN_MSG = "msprof_*.json" FILE_INFO = "msprof" + file_pattern_list = [r"^msprof[_\d]+.json$"] + def __init__(self, path: str) -> None: super().__init__(path) self._tasks: List[TaskInfo] = [] diff --git a/profiler/advisor/common/profiling/op_summary.py b/profiler/advisor/common/profiling/op_summary.py index d79439dbad8..4744b5029ad 100644 --- a/profiler/advisor/common/profiling/op_summary.py +++ b/profiler/advisor/common/profiling/op_summary.py @@ -16,13 +16,13 @@ class OpSummary(ProfilingParser): """ op summary """ - - FILE_PATTERN = r"^op_summary_[_\d]+\.csv$" FILE_PATTERN_MSG = "op_summary_*.csv" FILE_INFO = "op summary" 
STATIC_OP_STATE = "static" DYNAMIC_OP_STATE = "dynamic" + file_pattern_list = [r"^op_summary_[_\d]+\.csv$"] + def __init__(self, path: str) -> None: super().__init__(path) self.op_list: List[OpInfo] = [] diff --git a/profiler/advisor/common/profiling/tasktime.py b/profiler/advisor/common/profiling/tasktime.py index 3ce09a78385..732ff0f3679 100644 --- a/profiler/advisor/common/profiling/tasktime.py +++ b/profiler/advisor/common/profiling/tasktime.py @@ -17,11 +17,11 @@ class TaskTime(ProfilingParser): """ task time info """ - - FILE_PATTERN = r"^task_time_[_\d]+\.json$" FILE_PATTERN_MSG = "task_time*.json" FILE_INFO = "task time" + file_pattern_list = [r"^task_time_[_\d]+\.json$"] + def __init__(self, path: str) -> None: super().__init__(path) self._tasks: List[TaskInfo] = [] diff --git a/profiler/advisor/common/timeline/event.py b/profiler/advisor/common/timeline/event.py index 6001ac88722..e24d983a02f 100644 --- a/profiler/advisor/common/timeline/event.py +++ b/profiler/advisor/common/timeline/event.py @@ -1,3 +1,4 @@ +from decimal import Decimal class AdvisorDict(dict): def __getstate__(self): return self.__dict__ @@ -18,6 +19,6 @@ class AdvisorDict(dict): class TimelineEvent(AdvisorDict): def ts_include(self, event): - - return float(self.ts) <= float(event.ts) and float(self.ts) + float(self.dur) >= float(event.ts) + float( + return Decimal(self.ts) <= Decimal(event.ts) and Decimal(self.ts) + Decimal(self.dur) >= Decimal( + event.ts) + Decimal( event.dur) \ No newline at end of file diff --git a/profiler/advisor/common/timeline/fusion_ops_db.py b/profiler/advisor/common/timeline/fusion_ops_db.py index 8637befd1ab..64cc849295f 100644 --- a/profiler/advisor/common/timeline/fusion_ops_db.py +++ b/profiler/advisor/common/timeline/fusion_ops_db.py @@ -1,13 +1,12 @@ import logging import os -import yaml - from profiler.advisor.common import constant from profiler.advisor.common.timeline.fusion_ops_rule import OpRule from 
profiler.advisor.common.timeline.fusion_ops_rule_handler import TimelineOpRuleHandler from profiler.advisor.utils.log import get_log_level from profiler.advisor.utils.utils import get_file_path_by_walk +from profiler.cluster_analyse.common_func.file_manager import FileManager logger = logging.getLogger() logger.setLevel(get_log_level()) @@ -241,8 +240,7 @@ class FusionOperatorDB: logger.debug("The rule yaml file is successfully found in path: %s", os.path.abspath(file_path)) - with open(file_path, "rb") as file: - db_content = yaml.safe_load(file) + db_content = FileManager.read_yaml_file(file_path) if not self._is_version_supported(db_content): self.is_empty = True diff --git a/profiler/advisor/config/config.ini b/profiler/advisor/config/config.ini index c56c1dad9f0..06e99316010 100644 --- a/profiler/advisor/config/config.ini +++ b/profiler/advisor/config/config.ini @@ -9,6 +9,7 @@ tune_ops_file = operator_tuning_file.cfg [THRESHOLD] # operator_bound_ratio: (mte, cube, vector, scalar) ratio greater than this value will be checked in operator_bound_checker operator_bound_ratio = 0.8 +frequency_threshold = 0.05 [RULE-BUCKET] # region : URL of different regions where can download rule yaml file cn-north-9 = cnnorth9-modelarts-sdk diff --git a/profiler/advisor/config/config.py b/profiler/advisor/config/config.py index 12f4526f8c9..4f36dfedfc8 100644 --- a/profiler/advisor/config/config.py +++ b/profiler/advisor/config/config.py @@ -97,6 +97,13 @@ class Config: """ return float(self.config.get("THRESHOLD", "operator_bound_ratio")) + @property + def frequency_threshold(self) -> float: + """ + frequency_threshold + """ + return float(self.config.get("THRESHOLD", "frequency_threshold")) + def set_log_path(self, result_file: str, log_path: str = None): self.log_path = log_path if log_path is not None else os.path.join(self._work_path, "log") os.makedirs(self.log_path, exist_ok=True) diff --git a/profiler/advisor/config/profiling_data_version_config.yaml 
b/profiler/advisor/config/profiling_data_version_config.yaml index 4ef76105a07..b8c92fe074d 100644 --- a/profiler/advisor/config/profiling_data_version_config.yaml +++ b/profiler/advisor/config/profiling_data_version_config.yaml @@ -1,18 +1,19 @@ versions: - version: 8.0.RC1 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: - mindstudio_profiler_output: - [ op_summary, msprof ] + mindstudio_profiler_output: [ op_summary, msprof ] class_attr: op_summary: OpSummary msprof: Msprof file_attr: - op_summary: ^op_summary_\d{14}\.csv$ msprof: ^msprof_\d{14}\.json$ + op_summary: [ kernel_details.csv, '^op_summary_\d{14}\.csv$' ] - version: 7.0.0 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: ^device_\d+$: summary: @@ -28,13 +29,14 @@ versions: msprof: Msprof ge_info: GeInfo file_attr: - op_summary: ^op_summary_\d+_\d+_\d{14}\.csv$ + op_summary: [ kernel_details.csv, '^op_summary_\d+_\d+_\d{14}\.csv$'] task_time: ^task_time_\d+_\d+_\d{14}\.json$ msprof: ^msprof_\d+_\d+_\d{14}\.json$ ge_info: ge_info.db - version: 7.0.RC1 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: ^device_\d+$: summary: @@ -50,13 +52,14 @@ versions: msprof: Msprof ge_info: GeInfo file_attr: - op_summary: ^op_summary_\d+_\d+_\d+_\d{14}\.csv$ + op_summary: [ kernel_details.csv, '^op_summary_\d+_\d+_\d+_\d{14}\.csv$'] task_time: ^task_time_\d+_\d+_\d+_\d{14}\.json$ msprof: ^msprof_\d+_\d+_\d+_\d{14}\.json$ ge_info: ge_info.db - version: 6.3.RC2 dirs_pattern: + ASCEND_PROFILER_OUTPUT: [ op_summary ] ^PROF_\d{6}_\d{17}_\w+$: ^device_\d+$: summary: @@ -72,9 +75,7 @@ versions: msprof: Msprof ge_info: GeInfo file_attr: - op_summary: ^op_summary_\d+_\d+\.csv$ + op_summary: [ kernel_details.csv, '^op_summary_\d+_\d+\.csv$'] task_time: ^task_time_\d+_\d+\.json$ msprof: ^msprof_\d+_\d+\.json$ ge_info: ge_info.db - - diff --git a/profiler/module_visualization/prof_parse/__init__.py 
b/profiler/advisor/dataset/ai_core_freq/__init__.py similarity index 100% rename from profiler/module_visualization/prof_parse/__init__.py rename to profiler/advisor/dataset/ai_core_freq/__init__.py diff --git a/profiler/advisor/dataset/ai_core_freq/ai_core_freq_dataset.py b/profiler/advisor/dataset/ai_core_freq/ai_core_freq_dataset.py new file mode 100644 index 00000000000..c99baea6564 --- /dev/null +++ b/profiler/advisor/dataset/ai_core_freq/ai_core_freq_dataset.py @@ -0,0 +1,148 @@ +import json +import logging +import math +import os +import traceback + +import ijson +from tqdm import tqdm + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.utils.utils import get_file_path_from_directory +from profiler.advisor.utils.utils import convert_to_float, parse_json_with_generator +from profiler.advisor.dataset.profiling.device_info import DeviceInfoParser +from profiler.advisor.config.config import Config + +logger = logging.getLogger() + + +class AICoreFreqDataset: + + def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: + + self._profiler_step = [] + self._ai_core_ops = [] + self._ai_core_freq: [TimelineEvent] = [] + self._previous_freq_index = -1 + + self.timeline_dir = collection_path + self.timeline_data_list = get_file_path_from_directory(collection_path, + lambda file: file.endswith("trace_view.json")) + + self.step = kwargs.get("step") + self.op_freq = {} + info = DeviceInfoParser(collection_path) + info.parse_data() + if not Config().get_config("aic_frequency"): + return + if self.parse(): + key = self.get_key() + if key not in data: + data[key] = [] + data[key].append(self) + + @property + def profiler_step(self): + return self._profiler_step + + @property + def ai_core_freq(self): + return self._ai_core_freq + + @property + def ai_core_ops(self): + return self._ai_core_ops + + @classmethod + def get_key(cls): + """ + get key of 
dataset + :return: key + """ + return cls.__module__.rsplit('.', maxsplit=1)[-1] + + def parse(self): + + if len(self.timeline_data_list) == 0: + logger.warning("Please ensure trace_view.json in %s, skip timeline analysis.", self.timeline_dir) + return False + + if len(self.timeline_data_list) > 1: + logger.warning("Found multiple trace_view.json in %s, load the file of device 0 for analysis .", + self.timeline_dir) + + _ = parse_json_with_generator(sorted(self.timeline_data_list)[0], self._add_event) + + target_ai_core_ops = self._get_target_ai_core_ops() + self._get_op_frequency(target_ai_core_ops) + return True + + def _add_profiler_step(self, event): + if event.name.startswith("ProfilerStep"): + self._profiler_step.append(event) + + def _add_ai_core_ops(self, event): + if event.args.get("Task Type") in ["MIX_AIC", "AI_CORE"]: + self._ai_core_ops.append(event) + + def _add_ai_core_freq(self, event): + if event.name == "AI Core Freq": + if self._previous_freq_index != -1: + self._ai_core_freq[self._previous_freq_index]["end"] = event.get("ts", float(math.inf)) + self._previous_freq_index += 1 + event.setdefault("end", float(math.inf)) + self._ai_core_freq.append(event) + + def _add_event(self, index, event): + event["dataset_index"] = index + if not isinstance(event, TimelineEvent): + event = TimelineEvent(event) + + self._add_profiler_step(event) + self._add_ai_core_ops(event) + self._add_ai_core_freq(event) + + return True + + def _get_target_ai_core_ops(self): + target_ai_core_ops = [] + if not self.step or f"ProfilerStep#{self.step}" not in [event.name for event in self._profiler_step]: + target_ai_core_ops = self._ai_core_ops + else: + for step_event in self._profiler_step: + if step_event.name != f"ProfilerStep#{self.step}": + continue + + for ai_core_op_event in self._ai_core_ops: + if step_event.ts_include(ai_core_op_event): + target_ai_core_ops.append(ai_core_op_event) + target_ai_core_ops = sorted(target_ai_core_ops, key=lambda x: float(x.ts)) + return 
target_ai_core_ops + + def _get_op_frequency(self, ai_core_ops): + ai_core_freq = sorted(self._ai_core_freq, key=lambda x: float(x.ts)) + + op_index, freq_index = 0, 0 + while op_index < len(ai_core_ops) and freq_index < len(ai_core_freq): + op_event = ai_core_ops[op_index] + op_end_time = convert_to_float(op_event.ts) + convert_to_float(op_event.dur) + op_freq_list = [] + while freq_index < len(ai_core_freq): + freq_event = ai_core_freq[freq_index] + if convert_to_float(freq_event.end) < op_end_time: + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + freq_index += 1 + continue + elif convert_to_float(freq_event.ts) < op_end_time: + if op_event.name not in self.op_freq: + self.op_freq[op_event.name] = {"count": 0, "dur": 0, "freq_list": []} + self.op_freq[op_event.name]["count"] += 1 + self.op_freq[op_event.name]["dur"] += convert_to_float(op_event.dur) + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + self.op_freq[op_event.name]["freq_list"].append(min(op_freq_list)) + break + else: + break + + op_index += 1 diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py index 09fda2d4dcf..e1163f1cdd8 100644 --- a/profiler/advisor/dataset/cluster/cluster_dataset.py +++ b/profiler/advisor/dataset/cluster/cluster_dataset.py @@ -25,9 +25,9 @@ class ClusterDataset(Dataset): """ for file in os.listdir(self.collection_path): if file == 'cluster_analysis_output': - print("[INFO]Cluster has been analyzed " - "because of the existence of cluster analysis output directory.") - print("[INFO]Skip Cluster analyze backend.") + logger.info("[INFO]Cluster has been analyzed " + "because of the existence of cluster analysis output directory.") + logger.info("[INFO]Skip Cluster analyze backend.") return True return False @@ -62,7 +62,7 @@ class ClusterDataset(Dataset): @singleton -class ClusterStepTraceTimeDataSet(ClusterDataset): +class ClusterStepTraceTimeDataset(ClusterDataset): RANK = "rank" def 
__init__(self, collection_path: str, data: dict, **kwargs): @@ -77,10 +77,10 @@ class ClusterStepTraceTimeDataSet(ClusterDataset): print("捕获到异常:", e) self._step_dict = None return False - self._step_dict = self.formate_data(step_data) + self._step_dict = self.format_data(step_data) return True - def formate_data(self, step_data: list): + def format_data(self, step_data: list): step_dict = defaultdict(lambda: [0, 0, 0]) for step_bean in step_data: if step_bean.type == self.RANK: @@ -94,7 +94,7 @@ class ClusterStepTraceTimeDataSet(ClusterDataset): @singleton -class ClusterCommunicationDataSet(ClusterDataset): +class ClusterCommunicationDataset(ClusterDataset): RDMA_TIME_MS = "RDMA time(ms)" RDMA_SIZE_MB = "RDMA size(mb)" SDMA_TIME_MS = "SDMA time(ms)" diff --git a/profiler/advisor/dataset/profiling/device_info.py b/profiler/advisor/dataset/profiling/device_info.py index b58930777f9..110cd0794c6 100644 --- a/profiler/advisor/dataset/profiling/device_info.py +++ b/profiler/advisor/dataset/profiling/device_info.py @@ -54,6 +54,8 @@ class DeviceInfoParser: config.set_config("device_id", device_info["id"]) if "aiv_num" in device_info: config.set_config("aiv_num", device_info["aiv_num"]) + if "aic_frequency" in device_info: + config.set_config("aic_frequency", device_info["aic_frequency"]) if "ai_core_num" in device_info: config.set_config("ai_core_num", device_info["ai_core_num"]) return True diff --git a/profiler/advisor/dataset/profiling/profiling_dataset.py b/profiler/advisor/dataset/profiling/profiling_dataset.py index 46d4a4fe8b1..ebd90951abf 100644 --- a/profiler/advisor/dataset/profiling/profiling_dataset.py +++ b/profiler/advisor/dataset/profiling/profiling_dataset.py @@ -10,6 +10,7 @@ from profiler.advisor.common.profiling.tasktime import TaskTime from profiler.advisor.dataset.dataset import Dataset from profiler.advisor.dataset.profiling.device_info import DeviceInfoParser from profiler.advisor.utils.utils import join_prof_path +from 
profiler.cluster_analyse.common_func.file_manager import FileManager logger = logging.getLogger() @@ -42,14 +43,21 @@ class ProfilingDataset(Dataset): self.build_from_pattern(value, join_prof_path(current_path, key)) elif isinstance(dirs_pattern, list): for item in dirs_pattern: + if hasattr(self, item) and getattr(self, item): + # 避免重复构建kernel_details.csv, op_summary.csv的数据对象 + continue + file_pattern_list = self.current_version_pattern.get('file_attr').get(item) data_class = globals()[self.current_version_pattern.get('class_attr').get(item)] - data_class.FILE_PATTERN = self.current_version_pattern.get('file_attr').get(item) + if not hasattr(data_class, "file_pattern_list"): + continue + setattr(data_class, "file_pattern_list", self.current_version_pattern.get('file_attr').get(item)) data_object = data_class(current_path) is_success = data_object.parse_data() if is_success: setattr(self, item, data_object) else: - logger.warning("Skip parse %s from local path %s", self.current_version_pattern.get('class_attr').get(item), current_path) + logger.info("Skip parse %s with file pattern %s from local path %s", + self.current_version_pattern.get('class_attr').get(item), file_pattern_list, current_path) else: logger.warning(f"Unsupported arguments : %s to build %s", dirs_pattern, self.__class__.__name__) @@ -69,8 +77,7 @@ class ProfilingDataset(Dataset): logger.warning("Skip parse profiling dataset, because %s does not exist.", config_path) return [] - with open(config_path, 'r') as f: - patterns = yaml.safe_load(f) + patterns = FileManager.read_yaml_file(config_path) return patterns diff --git a/profiler/advisor/dataset/profiling/profiling_parser.py b/profiler/advisor/dataset/profiling/profiling_parser.py index bb4caeb29e5..51996617c2b 100644 --- a/profiler/advisor/dataset/profiling/profiling_parser.py +++ b/profiler/advisor/dataset/profiling/profiling_parser.py @@ -12,10 +12,10 @@ class ProfilingParser: """ profiling """ - FILE_PATTERN = "" FILE_PATTERN_MSG = "" 
FILE_INFO = "" - FILE_PATH = "" + + file_pattern_list = [] def __init__(self, path: str) -> None: self._path = path @@ -37,15 +37,20 @@ class ProfilingParser: return False def _parse_from_file(self): - file_list = get_file_path_from_directory(self._path, self.file_match_func(self.FILE_PATTERN)) - if not file_list: - return False - ## get last file - file = file_list[-1] - self.FILE_PATH = file - if len(file_list) > 1: - logger.warning("Multiple copies of %s were found, use %s", self.FILE_INFO, file) - return self.parse_from_file(file) + + if not isinstance(self.file_pattern_list, list): + self.file_pattern_list = [self.file_pattern_list] + + for file_pattern in self.file_pattern_list: + file_list = get_file_path_from_directory(self._path, self.file_match_func(file_pattern)) + if not file_list: + continue + ## get last file + target_file = file_list[-1] + if len(file_list) > 1: + logger.warning("Multiple copies of %s were found, use %s", self.FILE_INFO, target_file) + return self.parse_from_file(target_file) + return False @staticmethod def get_float(data) -> float: diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index 94b6fdfef78..1504e65f54f 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -1,14 +1,17 @@ +import json import logging -from typing import List +import os +from typing import List, Any +import traceback import ijson -from profiler.advisor.dataset.dataset import Dataset from tqdm import tqdm +import yaml from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.advisor.utils.utils import get_file_path_from_directory -from profiler.advisor.utils.utils import singleton +from profiler.advisor.utils.utils import get_file_path_from_directory, check_path_valid, singleton +from profiler.cluster_analyse.common_func.file_manager import FileManager 
logger = logging.getLogger() @@ -38,37 +41,76 @@ class OpCompileCollector: self._total_op_compile_time = 0.0 +class SynchronizeStreamCollector: + + def __init__(self): + self._synchronize_stream_count = 0 + self._slow_synchronize_stream = [] + self.rule = SynchronizeStreamCollector._load_rule() + + @property + def total_count(self): + return self._synchronize_stream_count + + @property + def slow_synchronize_stream(self): + return self._slow_synchronize_stream + + @staticmethod + def _load_rule(): + sync_stream_rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "rules", + "synchronize.yaml") + + sync_stream_rule = FileManager.read_yaml_file(sync_stream_rule_path) + return sync_stream_rule + + def update_sync_stream_count(self): + self._synchronize_stream_count += 1 + + def append_slow_sync_stream(self, event): + if float(event.dur) / 1000 >= self.rule.get("slow_synchronize_threshold", 10): + self._slow_synchronize_stream.append(event) + + def unset(self): + self._synchronize_stream_count = 0 + self._slow_synchronize_stream = [] + + @singleton -class TimelineEventDataset(Dataset): +class TimelineEventDataset: - def __init__(self, collection_path, data: dict, **kwargs) -> None: + def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: self._ops_with_task_type = {} self._ops_with_stack = {} self._ops_compile = OpCompileCollector() self._torch_to_npu = {} self._acl_to_npu = set() - self._aten: List[str] = [] - self._optimizer: List[str] = [] + self._aten: List[Any] = [] + self._optimizer: List[Any] = [] + self._dataloader: List[Any] = [] + self._sync_batchnorm: List[Any] = [] + self._synchronize_stream = SynchronizeStreamCollector() self.timeline_dir = collection_path - self.timeline_data_list = get_file_path_from_directory(collection_path, lambda file: file.endswith("trace_view.json")) + self.timeline_data_list = get_file_path_from_directory(collection_path, + lambda file: 
file.endswith("trace_view.json")) self.dataset_len = None self.analysis_mode = kwargs.get("analysis_mode") self.task_type = kwargs.get("task_type") - self.cann_version = kwargs.get("cann_version") - self.torch_version = kwargs.get("torch_version") - if self.analysis_mode in ["fusion_ops", "all"]: - logger.info("Load fusion operators database for cann version '%s' and torch version '%s'", - self.cann_version, self.torch_version) + if not build_dataset: + return - super().__init__(collection_path, data) + if self.parse(): + key = self.get_key() + if key not in data: + data[key] = [] + data[key].append(self) if self.analysis_mode in ["op_stack", "all"]: self._task_op_names = list(set([event_key.split("-")[0] for event_key in self._ops_with_task_type.keys()])) self._post_process() - @property def ops_with_stack(self): return self._ops_with_stack @@ -101,36 +143,60 @@ class TimelineEventDataset(Dataset): def aten(self): return self._aten - def _parse(self): + @property + def dataloader(self): + return self._dataloader + + @property + def sync_batchnorm(self): + return self._sync_batchnorm + + @property + def synchronize_stream(self): + return self._synchronize_stream + + @classmethod + def get_key(cls): + """ + get key of dataset + :return: key + """ + return cls.__module__.rsplit('.', maxsplit=1)[-1] + + def parse(self): if len(self.timeline_data_list) == 0: logger.warning("Please ensure trace_view.json in %s, skip timeline analysis.", self.timeline_dir) return False if len(self.timeline_data_list) > 1: - logger.warning("Please ensure only one trace_view.json in %s, there will analyze first timeline profiling data.", self.timeline_dir) - self.timeline_data_list = [self.timeline_data_list[0]] + logger.warning("Found multiple trace_view.json in %s, load the file of device 0 for analysis .", + self.timeline_dir) result = self.parse_data_with_generator(self._add_event) if not self.dataset_len: self.dataset_len = len(result) - return True def parse_data_with_generator(self, 
func): result = [] + timeline_data_path = sorted(self.timeline_data_list)[0] + if not check_path_valid(timeline_data_path): + return result + try: - with open(self.timeline_data_list[0], "r") as f: + with open(timeline_data_path, "r") as f: for i, event in tqdm(enumerate(ijson.items(f, "item")), leave=False, ncols=100, desc="Building dataset for timeline analysis", total=self.dataset_len): func_res = func(index=i, event=event) if func_res is not None: result.append(func_res) - except Exception as e: - logger.warning("Error %s while parsing file %s, continue to timeline analysis", e, - self.timeline_data_list[0]) + + except Exception: + logger.warning("Error %s while parsing file %s, continue to timeline analysis", traceback.format_exc(), + timeline_data_path) return result def _add_ops_with_task_type(self, event): @@ -168,12 +234,40 @@ class TimelineEventDataset(Dataset): "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur })) + def _add_dataloader(self, event: TimelineEvent): + if "dataloader" in event.name.lower(): + self._dataloader.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur, + "stack": event.args.get("Call stack") + })) + + def _add_sync_batchnorm(self, event: TimelineEvent): + if event.name.lower() == "syncbatchnorm": + self._sync_batchnorm.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur + })) + + def _add_synchronize(self, event: TimelineEvent): + if event.name.startswith(const.SYNC_STREAM): + self._synchronize.append(TimelineEvent({ + "name": event.name, "ts": event.ts, "dur": event.dur + })) + + def _add_specific_operator(self, event): + # for analysis of operator aclOpCompile, enable jit_compILE=False + self._add_op_compile(event) + # for analysis of slow dataloader.__next__ + self._add_dataloader(event) + # for analysis of syncBatchNorm operator, prompt users to replace source 
code of torch_npu's syncbn + self._add_sync_batchnorm(event) + def _add_event(self, index, event): event["dataset_index"] = index if not isinstance(event, TimelineEvent): event = TimelineEvent(event) - self._add_op_compile(event) + self._add_specific_operator(event) + if self.analysis_mode == "fusion_ops": self._add_event_for_fusion_ops(event) elif self.analysis_mode == "op_stack": @@ -189,6 +283,10 @@ class TimelineEventDataset(Dataset): self._add_aten(event) return + # 检查cann层同步操作,根据时间窗口索引到host侧的aten算子并给出堆栈 + if event.name.startswith(const.SYNC_STREAM): + self._add_aten(event) + if event.name.startswith(f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"): self._add_optimizer(event) return @@ -214,7 +312,18 @@ class TimelineEventDataset(Dataset): # eliminate sub aten operator of the first level aten operator by 'ts' and 'dur', # keep the first level aten operator contiguous formated_atens = [] - for aten_event in sorted(self._aten, key=lambda x: x.get("ts", -1)): - if not formated_atens or not formated_atens[-1].ts_include(aten_event): - formated_atens.append(aten_event) + for event in sorted(self._aten, key=lambda x: x.get("ts", -1)): + if event.name.startswith(const.ATEN): + if not formated_atens or not formated_atens[-1].ts_include(event): + formated_atens.append(event) + + elif event.name.startswith(const.SYNC_STREAM): + self._synchronize_stream.update_sync_stream_count() + if formated_atens[-1].ts_include(event): + # 使用aten算子的索引,用于查询堆栈 + event["dataset_index"] = formated_atens[-1].get("dataset_index") + self._synchronize_stream.append_slow_sync_stream(event) + + else: + continue self._aten = formated_atens diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py index 8ea7c9e0fc2..3984fa8f34f 100644 --- a/profiler/advisor/display/html/render.py +++ b/profiler/advisor/display/html/render.py @@ -1,6 +1,7 @@ import os import logging from typing import List, Dict +from collections import defaultdict from jinja2 
import Environment, FileSystemLoader from profiler.advisor.common import constant @@ -15,7 +16,7 @@ logger = logging.getLogger() class HTMLRender: def __init__(self): self.html = "" - self.render_list: Dict[str, List] = {} + self.render_list = defaultdict(list) def render_html(self, template_dir: str = "templates", template_name: str = "main.html", template_header=constant.DEFAULT_TEMPLATE_HEADER): @@ -30,8 +31,6 @@ class HTMLRender: autoescape=True) template = env.get_template(template_name) rendered_html = template.render(**kwargs) - if key not in self.render_list: - self.render_list[key] = [] self.render_list[key].append(rendered_html) return rendered_html diff --git a/profiler/advisor/display/html/templates/ai_core_frequency.html b/profiler/advisor/display/html/templates/ai_core_frequency.html new file mode 100644 index 00000000000..d0451420373 --- /dev/null +++ b/profiler/advisor/display/html/templates/ai_core_frequency.html @@ -0,0 +1,27 @@ +{% if data|length > 0 %} +
+

AI CORE Frequency Issues

+
+ Issue: {{ desc }} +
+ Suggestion: {{ suggestion }} +

+ + + {% for header in headers %} + + {% endfor %} + + + {% for row in data %} + + {% for element in row %} + + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element|safe }}
+ +
+
+{% endif %} \ No newline at end of file diff --git a/profiler/advisor/display/html/templates/slow_dataloader.html b/profiler/advisor/display/html/templates/slow_dataloader.html new file mode 100644 index 00000000000..ae3a22f283c --- /dev/null +++ b/profiler/advisor/display/html/templates/slow_dataloader.html @@ -0,0 +1,18 @@ +
+

Slow Dataloader Issues

+
+ {{ desc }} + + + + + + {% for suggestion in suggestions %} + + + + {% endfor %} +
Suggestions
{{ loop.index }}. {{ suggestion|safe }}
+ +
+
diff --git a/profiler/advisor/display/html/templates/sync_batchnorm.html b/profiler/advisor/display/html/templates/sync_batchnorm.html new file mode 100644 index 00000000000..0a4cb3e7302 --- /dev/null +++ b/profiler/advisor/display/html/templates/sync_batchnorm.html @@ -0,0 +1,30 @@ + +
+

SyncBatchNorm Issues

+
+ {{ desc }} + + + + + {% for item in solutions %} + {% set rowloop = loop %} + {% for key, value in item.items() %} + + + + {% endfor %} + {% endfor %} +
Suggestions
{{ rowloop.index }}. {{ value.desc }}
+ + More efficient code of syncbn forward as follows: + {% for item in solutions %} + {% for key, value in item.items() %} + {% if 'efficient_code' in value %} +
{{ value.efficient_code|safe }}
+ {% endif %} + {% endfor %} + {% endfor %} + +
+
diff --git a/profiler/advisor/display/html/templates/synchronize_stream.html b/profiler/advisor/display/html/templates/synchronize_stream.html new file mode 100644 index 00000000000..fd95b486151 --- /dev/null +++ b/profiler/advisor/display/html/templates/synchronize_stream.html @@ -0,0 +1,57 @@ +
+

Synchronize Stream Issues

+
+ {{ desc }} + + + + + + + {% for item in solutions %} + {% set rowloop = loop %} + {% for key, value in item.items() %} + + + + + {% endfor %} + {% endfor %} +
Suggestions
{{ rowloop.index }}. {{ value.desc }}
+ +
+ {% if not empty_stacks %} + Please click on the collapsible box below to view the detailed code stack that triggers synchronizeStream + {% elif not framework_black_list %} + Suggestion: + These operators have no code stack. If parameter 'with_stack=False' was set while profiling, please refer to + Ascend PyTorch Profiler to set + 'with_stack=True'. Otherwise, ignore following affinity APIs due to backward broadcast lack of stack. + {% endif %} + + {% for api_name, stacks in result.items() %} + + {% if empty_stacks %} +
{{api_name|safe}}
+ + {% elif stacks | length > 0 %} + +
{{api_name|safe}}
+
+
+ {% for stack in stacks %} +
No.{{loop.index|safe}} code stack, called {{stack[1]|safe}} times
+ + {% endfor %} +
+
+ {% endif %} + + {% endfor %} + +
+ +
+
diff --git a/profiler/advisor/img/overall.png b/profiler/advisor/img/overall.png index 6d5da107a3f7f6c8c655922bd80d193708fe71aa..1883d4c97388b1cfb774d05fc9e0d368d0c66901 100644 GIT binary patch literal 49616 zcmd42Rajlk(lrVz$ijjINpN=w1PSi$?yztI1b26Wy9ak)!QFxc2<`+6?j*Q#X7;!D z`(^*n|D2n1b8hGC*;2E*yK0P4;YtdUXvp};FfcG^(o$k7FfcDjVPHV)U=Z-cihKzH z28I+yT1;5Y9rmc>QI$Zmm6_*xx?Dwgwq=SkzE}nOjcO15q26oN-|5{p1ovujcG^l7 zjHy9oqmx(e&WT?`;?Q}`k08}B*wvM(y5pqJQq0BANQQs$uh)te|vg z=6NKktE+o{I=y)T5`qE4z>>ni`TI{}-o!300xvHrYOWlq<@eOoRptG7xzXD@UDJ7YHo~&Ivom$`KOMqgz<`9h zhrr)QM}2O$Qx&`JtQ?GtwhwD^o{#O;{GJ~L4cc!$xTQYdTD&MS4d`AP+HGoT?EJBC zGv3gUT+`*-SYO`^efvD$HTHa2SUU%;%gybVflo8kZI1_l<7Olej%c*6^- z7fhT1`s)P|T=^X_<8OF0o`!Cw{~r0EV<-zS6N#Iv@4~7prl%$HpC9&bcW}ng($l)R z%CzHjY$0iE@>^cy?t4~`p@lX!H9A`b=>Df2uo8Pr5X$-aT4m+d)6?~F6%&{d?+vaz z>_1&ug~7mS9783C{`tmV7BlVn&``7U2_KU&H3M%H^uLV*fW}NplIj23xB}C^+p)o8 z^8G*4&&3757lmZK@whwAd4}3~8*qDc?sEBxe1kn-bc6pz`p-Jn>EQIppikR%c3thh z#g!{hhhDLoY&qFs-Sm+dzW=*gM)7VGkSr`JCL=tG9xUvC1_}%HkA~N{d8pbiveVS) z^vTHZtnW(!XKqrI-EJoXk9J6W+~*XlAJPp_{1l&^Gva>T3i5l>d+TT1*LP{doDAn^ z?nqh&%VKJI4!uM(HARK{&!2Zkf&GD8X=r$RuCg;3_KZ#bG~a4hq$w6~`2)SUOaJa@ z*zSSyLeVe7AiZl=ezb@Y^XYCyk|AC2Id4?)q03`;Ek&dFDa5ve9!INVQ#QS;or3cI z>&(<}*JF*L*T|72(?3SAPXjM#KrR)l7=4x~II_gEpY|2^ll|`f(q1RHBzp@#bUg{i za)tGkJZL_V!&l3|Ym8TwtQFH#KNybIV4TU^-1zxKAH4Pc&(vXHNg+s}MjS1_G#$*8 zhhPo10GAy_!(o)bjed(~)mJ^_Piv1qpSn;`B70l&n%T7Ip`V`JPbG6knHYWAYT4aR z(VyXb%#bqUkp7t?DJeuZKwTj4auAE7Pz>5?$FG08vH7kGH{n~WOeaJ{9?5y~a*#!H zN8pL(7wli#OM?vfu&4|1@cZDtU5>k4By>z{_3YHA%mejoPN zp630$LT>yX*TSAYmMvqEt(0^4)pU(2O3@+*nBk}Q=p z#QAVOv6zti`Udu&-wqStLjf@fXgKBkDf}NV13Y~O8x~xd(Hq+v@b7~G29}E)CSY5l zB47TWjUXfsY=m;U2U@~^)^AS%!qBp$Q^IXU%PTV1+yu-M(r@umOe-O^>4bl>yc=LtBtmlqd-S2>T5{v`jb6$WH0 zzW#7~K8vd*Xk%x$<(SH>@3~);x{S=Q^uI@;`>O!Mr?b?qXOid55tc5!zYj5Vu0Zwm zTi`bH{{0mlxZ8jmZagI=+xJqPC+i8g`WF43-5&mT_X~CJgC7QuYODM`Yo+I0;ltz+w8pS@gs1#;o;!|JQ*&LA;9tJq{2~DOCD`I)h^zaFW4}0r9yi-Uw*~_vs!n`zr&k*4F7KWdp}Ia 
zoG+8)|BRo@0~Ue^L;n8j#&D!iMud8v*4ITau>Wc3DggsR0ipcmjkpvr0cNbZt{T&& z1pmxXi3s?Z6#AFXLs&tx7+OsdlK=I;GASTN_7yUqkzRZX20RLTYv-B%D$(gmaTuE~ zxQckY2<6QS&2yZiI^{wTZ9=U2xdnrfQ5{DwH2#_fJe%Jc{nsl7N@i;l9>>DgWuO4~ z05b}>N~)%Ly`cFelt%Z_jeA>`(ptN{_N&|mwSey0`1W7;$+ADJnObDe2Z^}IU>ci; zGQyT{m9~|Bx~T5VslWrP8<7BO$=Y(cc-0qS!$OO&UoB6H!>CP$z;nrBLR085v^W#+ ziUzKhh^&W>IWF4g450nixNB}+!pT0OyA6m{MaXcta4FFk(3E8ZCo}_VmW2(D$kVot z+~Ll=^QE9agN#U=^;)Yjf2?WPVjd-NTMmX<*r2v1w$c?!77dw~PuG>j0W*Rb3CJK! zEh{s}lPM~2-6<+^_L6w8O5%99%fLnb$0UnV21)ootlj=BdE#8hs|UA6+k5R8Ui}I! zYo541gBYA%WgbN9jW)l^cBJ!iwH!-IYbWoe?gD{sB$~ zT3KeZ+plBi=uNI2mV1xgRGS0LR{beic2kjhvsJDz5jug51@9I~)Mr8sIvpZja^_pC zdMCWcB@ON?E*A}YSX%CVBJ7{r6uT#xzOU8aDM*(y2w9x&|)2U854~XPU10E6b3Np5vx4NtY2RBJEuDYU-4Lf-H6DQVNzA{ zg6Z&2EHhjmQ}#UNO8aPA)t7vx@KhEmH2Vx@f9OjQevo56NtSDJ9$#a(51HYXGFpF_ ztJBpdoARY-B6-0Yrw%%V4CI3%9n879_&n*?5AoNSQ+EUCh7bJW8r#;jUzCGEvbEpF z%*~IgFg%=03&31f4O$T^GgLv3w5bFh?@S%vGFfc&k3>hGGA7;6CSOOF_JSduKd-QW zpg*;xje~1J+N8Fy^Y_fRcL)9|{Drs6MPr)#v=zwapL|T5Y^A_I6Yjcyi^-sh%k_Vo z)LI+DAp3%SOjd$rjBiLfm7~*65<5X_=d+oPNeB9pPK-52h$R2ZG%}y=eRBZj7wQ0$ z@DC20LYg$0F9gU7<)l^9XTu!GY~?`leVh5~5gW80UffIef$Ns*druDBV<(n`C|9Qk z1`s-52kU1jZuuBVJD=N@u*_U9B)#TP{IpcePTfQfh0C|ov!k$6(xi0?)yN)F<!ME2oRlV#+LK07Na0rMj~UIH?{~L)gD=;QZ74T{OVWywS6OOz)E=J#F0(Hyd?~_! 
zyY>N<^pkqaF8!}|SY5S(tXzF!Lu9z%mS*!p(+(B;mp+f9mw^}YyhMlJ%F+nAal|rv zn^tC}CpS8U&eJhjGpg5mwh^Xk)4KDyR9UYsA=pebllD2}5Fwm~?);ag*V2Zh6W3`1 zRBt?GzWAB$JrnRy*9CZ>_B5N(4e}+d>!i2Ln!KvHxn)Z-QJg$TFJ3-_P}+acq0H|7 z#UCoTc&dEe@4CWBO|UIz`m*h=_Y-|G*Rlb~+TvEj!rwW17iC;>$tQ2&aTJte*tVn) zLx~69ny56tP`5llt&Wp4!5Ao>iX!^u_wH@)$)LB+UAXjFPVp&9`(19 zu`m(46`g`u7Sx8g#zXEUCQ(W@p~1~6$yEGEi=ywzBULw{8ZHldLNbJwDIITz zbZnh0&TUh-Jm7fmjOmQR#YS0gJzH7CbP({GZbf`uj;fS$!>%NUr% zI8Q1t*=PwDi>0~3BPu=yv5M8m5`whiJ%ki+sHOP~&;yoxsXR6K%p3~T?+V{NApH($lGw>s6)eL?)+SMIP$R1&u)|j_I=fmDhJ3mG zTz-{Qxup1@%vzPctUM}KMEII!<~y@-n$<|lRHO#UKrG^3dEqb2mo&AN!Jmt;%4nF% z>gb~w`JXyH4XYAjRX&qLlMLzHv>RN6bCJ&6+Q1*$%E#8ewY)DCe4VJP2$}3rL037J z&#orrn#M8uP4V&z{eqe2+h0@@g*D|#5#3YENmbN*w7g(_x*JQen4x%vO{~R`-F#m~;VbOGo6%9gH?tEi)77e- z4OK}}>UN5YpQ5frsAXy&4&42`&PQhW!;M}};xf*y;{?7@7B{m^6l*B!&hH`MZXePq zQW9=1z|vX*$2ETvsG`M*=wC(El_DlP)_ljCfdh+%ryZUF)-du)WY!jlV3=gnFiJ*C z@@B7!;|Xefstnr;RKm#b%iH`I9k8&~85_1YGlBa!n6@kqvL5HxI=km}9ccR&FH2|S zSZo|!>W$i0+8q6$MB{M)jY#fzwFNm&cemvYLwy5&1j zY`HRDy6IPf)cL$E<-dD>&>m*>WA^WUUDVmawY{L~4WINb0~GcRU^=w zp`1fC!Jm5TCVkC0Ab=yIS3DRqECF>lLf43ehA?!!ol#MA%d#uk$Wyv^X+186cD!{w znKD&4K1;&VcId6C3>DdBTns8g6fwF}k&4rp_Q_LHlvxO`Vg9x_Kd_|%H)reX!MFNd z0aKb7Bf36pC2k*hIyn{i3l{qU~Ye0@CIkVb6tg4&OJLbwOHw zLr<}Iw~^B9-<{k zG3OAM3eQ8mg?*4k7o>}gGkZVCGi=#PjEAyET2Py$ZjM$i9lVgohVv}XA$q^2&=1O6 zP@8lRx473faa0YU@~J=VSu%L-sjBsiB8Fv`1E5y$R|+~RYp=a_Rm2t;M8~D|>`KC( z#xi-uzZuY%V$|$mlhtM1S*B%(jF_;-LoTC_rCrCIJ!k_}gUhp3U;}{7|FKU%9O(9S ztaM4@xL!nfrZW4OkyESTd*lO@`mV&dm$bhg#tLLF%2LqhXEv={DD=-NVz*y1x0yRM zo3RvQq7ORzwt7G!H2i-&O}JjDBB)`)rF_S^1Uynl`zyw*&tj!9M0ZT${Gj1s!++gk zFJMD9s|XaimgpGKP45)}O>*on0c%HT@an4#6j(GFZ_wS`4O|IEgzbeSlYAyu({7nE zP_hni@h{h0QY&l{1WEW#gsSCvrZQb0)p?>ww=>MYBH*Z75mjivsVbbY*R?e+J=}VZ!Ln zSgtR@z$*2_5Iz)2{3{s~hWx9ysuRv;R}v5;^4`W%iV39t5=raY>|etHy(~-4$>^Fg zwJ};Hbbto9p~?2*!Z9P z)5o-`UVJX0ZE7lhRU7QjG*+vLW{pxlgW+v~^YPEBfprJ_1H)(xN_=DhGV(HP*70IQ zbw2Am83J1-^>Rw6ZFI~A`d?Upt3M$?R*2{en)`&P6{Jvi9l6|30SimE)nccPaPn_24=MHZUA?qx6ajvO}vF}V+gre 
zq~u2M9(#hT0=s$Ol;Vr{G5ksD!-zgBqd@!LxYi0Y3F8pF?s=mQz~iA=N+Ui%zs=|q zeD~mm4ee>oC3~eJl|H^qV$eukv76u5pNK1VCWkuUMy;@-5hswko-O1s12MpiDCOIb zIWX|jchbh{mkY+$=L=3`Vy6l4axxe73A6wMRW+?xG+iISox~dShjBAiwcHT zm{IcOt>nAK&kfup64`Te6Rs^i=u2aioYfc$#wME!6;`2j=| zW3et_kL=3&qLOErZ55+fYcFwNMH|%=U}82M_kSj$&EVWV$kT*PZ$w4%ltR}e<&W+5 zv#{l|d?tRV(JVY!-6{R}^qon#JC1(&PR8@B<0%H$&sxP~^s>g*wF&M~fKzUwT+UgQ z8al3g{DQEgIV4rbi^Ot8Bqrv4#O82u0{P+rKF?~Vc2i;2x2J6f&TU&H#K$WSz%>ayUcKJASzA$?pfzqb2Q4Dw%GLm4o94(}YU!cqWRZ2>9J>^hIGmlv>kfQBA{b zP_~2;R%lO3_$p2CxX{>@e-k-uc^cZE$|S#e5%^m@I(^?yUecuu3p%na$0D1q5SlDa z^Uh&M!S7m~=MI^~-13U(SpyO9Hl+eJDVV&|*e|WI;?ky&>`W3{yFH(7TN$_sisD^bA9R9tAr%HikkM#oxX2*k%E>~}hCxd{zsCT`Jr@R^Z=@E(N^am^Ed z8oHog42_xQ|Ckr!X#FK^_<{V3V~fm(Uy3dm_Nm>*6dJ6qEko{c!8PA2t>FPvio{$M z2iy0KG7FOZd>Ria+n#+nH6bg>*9yTqc|dbVP7+u-RP6-~{iSUuFkX&{bgY-{b z5(<7BlZ#PbNZMD#b=nF;xV7&OR=3!`(nvvtm4=3~R&8qO7BI~=+BRQ|I{Ie>uv=Yb z8abS`_}1{2MnK@jFIh^)s!D2H;V531ohPVn^jg71EdZ`+Ilxyn9ak?0_O!PIZIkN@7qomR6E|>xk8LD^(9AOr}Ro^5?=s zjtqgf#X1Qnd7n4_qXj64e`_=tB_Uh2#wV@d9XC0pSx{8SuQhBVTGje~j)c0^=jp~G zA$nv^TeaQ_fnq+|JWiprx^k~>N6z^^=^HXby=PLbqI9TlT!zF{=de}QiF}N1=9|v6 z&ssXWj)(6wd&Ts63pRM&?@W~UsYh5O#xJ^dVn>8gY<+4a!3J0=c+$mcQ>nu~BH#SD zS@r6(Mg;s@Td5^%7SUl%$4PTFmb3aVxA(-9l}tF_Fo&17;=4$4j(wu96#h&K-r_@aiRf<#TWH4cgetcUiMY=?BOb7pe@@8-o=+tgymyqEMHB;ol9Qj zXY0qaJHxvbdwoi2{qo$|gVi{4-O8v-)D;_t)}uz$l$XysG5%z_Gezl?{>pF+_oS;^ zc0>?s9+i`p^i`>gD;ZC%1wo;L@@?1M(u$zNatcT&eMXNsu~KmP0n5)x+Wl~HH=#re znoY@vs?*xst7V}Z%>-smB*u(N&4m}L^AnTL_Zns-BC|ch0akF!RM5^ny9pq1!tcRC z;FKp(icNL&Q(D`qjDW}wJT8F~ZD-_Foq?n+yIZAZup;`n!CF*Ms%r6$_y-3)65~qd zXV+M40z<*mk4nc9EEG_-uU$rU>Jb9`QN@)D%xFysujVx(&T@dmBwWUG)Re=f23Ge< z9LDO9_D4{IZ-MASkob?LCvi4Cnj@?EM%Q8%LZT7J>(>4ou><25(W#pzf&gGZuBq{^ zxpL(QqbU_EI0CZXv2uGRGB?%_5Ra9dDdQSZ?5j_-xRd9eVKh}N(1yg=d|EPSmW4R< z()Es6f7r&(gR<&%AX@Ji6Xwlcz&7M6qcG-Jp{FP=c+2?VJQP4p23UwDOL}5sV*6>U z2@9sg#I}$SS14R^`L!Hlr?$p3>I5)NBQT5u~jZx}zU<;ye_(-(C+RBW-jyUAs)jUv8yvUTIArV zHrE*z-Ey8bE4_6CUwfY&{@>mucgbq4TxiHp;6~rr9tT{95w)1(rNsP6qCmq4m=WIH3>% 
zF6~VxEpk((g-b(|@=pzO$c%U8FNb!gP1w%vPiyIZ^SxJY?|Eo5nqrfa%$RZ`FE^Yo zSqu^%DV#Y&)NKphUOtY<2|L*42{5ZVj!=&gZ>d;y6tf`HZ(Fyhc_{`mt=8QN=%SV% zy?TGjj}TyHdHxks>q5L>>OS$Iir5#kl%@?7lsL4t5Hjh zEe;pTZ85YO*%0YXsDRfR`TJMx-MCG<3$<4P^eMvoS5c(UWm_ZkCUoD4uLCvB`w-dw zIFCyp!mmiFXIDjdC-z(=@N(q-jRO!z{hz#@` z2C9&K2d}GVrFEDIU#=BfChHX3?O0|s$ruwv0*CUQ0;Vx-aCnnW7$^2sxA!?2^t(w& z(o0bo-w7jxiVrS5DH*+|g}uT!G};x#E-r=LtsFcNfAgvfZvsH$x|i1|6>MzZ26dz@ zl`woAc|f@l(@a{UcR@_EVm~I$0_bBn4LKO>{-iyO)8(_20lFi#Da*FvNXJ6nVMJ(3>7q`{b{+k^$5tWQ851__k@R~nfTy%i#j(EL6f zNCH`vEb*@(0sl1%8sti~D6!BK222T6R!1BA#n$V!)5O5#-es;^9|7I-Va+1YTmpeo z8v!h|f90wH>XI7~0%_%Cur4!p3P_H?JPtc3{EDSTF#khped?lGrnM}2@eC5&r;>bt zI;-lGD!R2*RVCx}OXZILTSq!E&$N~zxD23bI-v#P<{qRXGN|<~g1EF4g8(o}AV`dS zBV{Ag=uUs4`dgl17_HW^Y4FR(!{NH3KD#7)h_x+LHGirsUfg4;K-9l#Fnq@|z_H-X z50QX~Hxx?Q^eSs>8AL~X_@$cXTZ^B*VPILFwCAz;__{oZ=+X*FDVEnyG~EA|^1q9_ z8&DN>$}|ql8l7#q5BklO`C?Z+_S@=rJWGLhwUxwf_1RVR*vs5}a94#g;^MqQkg@^6 zFYgEwx^<+8KiJq%>TlLIZKI?hS4N<2j`oFajm36}i|_-i>ur{utxE{#3te(iHkw2! z=M??=XxJAFGOH3guWE~+_`97OcXI}o3WiST{aXozXuFD&Z~JJq3DfKOt>D+S;%0sO zvUwCk;w~$Vr>?gmU+y#E1-dvT%9606t3~~1V|BZsDK4i1-5!iIa(PLe& z9#y@xYG74wEH(yU#R+FyNo8)7Aqs%Jv2fsx48k-taa4xP*qWCXb>2R#Y|AH~$Cq(r zg5edSC(R-r21>s8v+FDNzZgBnLkJt=C}-EU-IB-JSF}4m3pozS!^JvKv#3t zJGYT45gR$G%>tClSZqYl%u`plpfr0nzIHP5>{DS?I2sSWbvzBdxfHNH?CZN#MDXEH zY#B+bFTwAE-0I)Omc1s+tNt=el)Z)@3-A2}`;eQ>rFiWnduN?euBz@VLJb>cWi-Jt zC3fZ(z|371a3T4`m4G7aDR=fpL$+A4-eBbqms5!p;Q6SxF=hOS zvqD{6TGZZv{_^VLAA@0vL&}08M?~W%Zf9bU4?yG7+8;*3?ofu=l_qg{S-`dpDOsJ^ z;9z`s2jcg*Ii@G$v#XzvN=|DemUVwQPNSy_vwCC3`km%&zE6AvM-6gG!UDaL%(isQ4#YDDxtr z=Px1CqYKcX*|+_f-ztL&njVDjq5P%$n9Jr{<2xh5BWOluOc#yOW8CI}%~Zr!F6heK8q`8osl7z=5T!1GO;97l^V4Oant-iXQWVOB+L7&v>iwac2&7u&dK z3YJE#iEx>OO!@7%0(zB-(h_OI$~i!)TELq za;<<)nhcmIjcWBA(+2uU_LdK(fe5oC{j(yP?J+BNN{jaczUH2?(50&tg(>RO!g%<( z4NE@m{(PeR?mMfRO8QgBhTb`!?UNzg4=Za666_hX0tp3u6J-AOIZw{qB3%7~5!3ya z**S-HeHk%9uK|{~1nH8zmJ+HV7!~*WM^G|}AjL^fKAwzFy%^-fHDvOqG!c;!0=&ex zs{1cWvmvpn<% 
zXk797%fhKz6g%}!ADXTjeHQ+!V*HU)$pgc#r2LAw+vqYe8%rZehVZ1$LSLEHt?H$q zhVvvMWo2nrF=*85_AKekrrEH1mYMYi*7f=08QXz;t7ef^)*{fW8o2}{rG()Jpx|jk9Z-_Gu)md zOQ%ryBI`GE>%6A>Vji@m(I*P&LS%@T@#h}-jQZs=ywj#vlBHBKKOp7rodiW%IQU|@ zze|5Fx~D+x8FQx*l|^pzTwq_XMD&SX`(1`FFUHj>q0Fy(RQQWZnX_5kBI1aQrnfY#IGa`%6PKO+L`i2{ZB47$ORJSj z9TkVRwxLaKFTG7Qn9Xy%id?}*!pX=C z=1u|IX)hTshma=+RmB0p+e!ZS-?L@_55W9x?u;$CIb59q!PDa*ug6RPqEx_o9QLF~9{ z++fda&$+*2Z@RAao`FCnV)nbI3?Q{a=Rz0w581=_Z6}{v_y5f#-L?N;O(K&FH&->O z{9i)bF|ZabZ3QLty|=$V5*)y;WJrZj3i$(}xj!N`CXBiqD8w`-3esaE)Qt+HPUcD? z_?WOzc)7RXFr~&=w3I>JAVj@LAm_4{_X4B6(0vK3=4Osm-aK6$HbD=r6otXjUU~t0 zBkZ-%^QHkvON?j;X0X)01Ymwvh6giREFd*3BLe;2!;IF&`ZHy7_!OZbs?&>quiGbs z@>zRca}oMZJaRTxhd?4-v|9OSl=`<38mm1pwC0S$Kih-?1COE_A0GHKHFCyg(Ib{G z4B=e_N`a`NHBWXIGH+fMqb9|O7?pt#@TwEOb`4lsks*~s>b6$t;`L)+l z{aVHPG$^4mDY&_o4z!A;wQt&dBIRuy{xaauX7Ea|BpdmrjIWN6`Um)eFa}& zq8+1~^Lztv30Yd3@0p#Bn<%|mlSdtvm*k$6r|t|S5^*{gP+?y*R{A++z!fxUzGqJD z@Zpr=v9szJkbAbK^cIJV;tfG-@}5y-;{Kv1qNg^-cm_w&#d?RdV@3=Pe+|-b$7IT9 z@1;-dn7`9g%EJ01**Qbdb7jQ8$m*2t2so40N4--t$SfhIcB&gWedae4=2<+j+u!r6 zX-W>p2`GAx`&gMI=U}D@*G>s_e5GtCdP>ocy>FV<&MG&^@1a4~$5c49lSRwD#LGCQ zcZeTsFK&LrRk#_th5qieA*ES|%1LjQ5SBuaR@0XGjRu?sk4KW&;DVirGS-_ki?zKj zbWeh%utg%W-$XHO^YR9Ywf15>w>am1k zhVQ*~L)1C+TSE??h4Owdig>g|2Crr9Hiz39zM56(rST~W@1u@@*I>@F;bv_&#e-8q zPXxbK5~qA|c<@TO0g~ThL5;OCU;2bA;QT3GA92uO>B24FGbV&T(l5*3KNT*;oObrL z#hFOl0Mnc$>V}TRgr7#5>S{O^7T+UlmW`=k_?^#WIAQATIvh9MnBlvxcBC-H0h6|d zQ4hI#nrIB{LFAf8Pg@qjkw>F3JVwqlH^R0Xj7Swe)Pq$l@>j%Yu4|&7MNo1_JCiL& z9)!fG+BkwSmvWUC*kGyi#W1zlQVXd>456D=JXtpUJ}(XhI5z{r50Xld2dm_r5&PX6 zFuTO}Y(~jh!Zc=etadW1Yw^${(_Y%1y`yti-EdHe4ikzOZ)T_WUwPfJg^6RP& z)=9;Zd+8mxSsEf;Nk!@X`=f&%MTSed$~f)Ms=Nla=!qM7?$JMN({xT{&D!3Cse#XQ zUSG!_=5cBoUn4CNvOV>+3k|{K)!E*+tNb~R&quM(p<<_&){VtYS+bdMYgREr9*svV zf&C7PL#qYOk&eKz@O-MvaYD{DO%T3nAdTKM{bj+g-c6-VwPeRK7lZ)*WAIV5;KSab z@v`dtAME|4O|}My>6!iaw_D1g0{-9>W}C6red>S)yfgNx+&07t_CS}wlkZb_T7US> zL!PiO#k)=PD|*bXdF+2}0yzpOyC~i4gt!-U&y~M*l-pfHS<_MBBG6HeG}aXzcZI5I 
z(I(FWYGDyIQ{{q_23!4#TSE%tvd49wGh#z!#=VRnazhU6^ zR>F;JXedOu6dI9r(?)xF{PYLZ9fVHPUHioy#UAWkUgrpSKG6*cgtdSviiS3{J4D%y+gGEKe2z_Jv1aft@kIX z&{iLX0n3R1#86)BPUmI3?=F%`Ev*4K|A6%M?DBqiWR=T7$9tz2M8-Gg25!(pS6tAD zuj$+Bcv$u3t_uVo8iY<`Ovc~2tf}&A>4ljd{dGlI041Qu>B1`K>m@(Avc#{|kBC*Y zIW(aDwdP8sTNe~rP*WVg8uaH(zkbb&p>c(s*RHSqsy*wWV~r)c;EIjLszBn8!|&Gv zZBKiw^6mJ}ZZpSLu?-ExEsB_We+p-4-N$dJID0HTSIq5sO?iE^ykdCO!Q}fpS;rkc zC93Ho3+iG*R{H||Lhab|58t7I^A|>j70$)ZuFadDe)W?3sa_zQ(-W@zQh4Z@t2V-6 zQiIuoij|6d6eDa`f8=K)z+ck#7E-nZJ`1XOU}5C(tdR)*R<+z z&hNTN))^E2UdM5n(S25#*Rf!4BDg0mHnDRCgpk``dRXw`gmBOZZ&ZkZlCHCbb-XYN z7{NXJ7a9$B5Lxw~u8u~z5s6irv#p{KbsSr44SBeA#ES67=@F3bXWO3gSrWA}{22un zFlb~SSL|^F2mrKj@gI}|VHQ+CNVaIE?S<`*4@%Y$V9))`LMFR^T zJ2>Za3{2^UGm1vt6GWC{($!fzH=ne-1i2?ZU+a5{) z_Lk0#`~FuLjArrE7I^3OW4?DTfF3t+|D^Jdg6I$Q+ZJwJ>;lC7a^X^&dO(}qrRrf0 z06#`;c8aX~8$WJJEZ8x}fYJS)8_W)$ji@g4A`wU^fC?6ZA(TW^dj~ckEilfVV2{dJ z_xFHX*ZOX<{X6MzwNtDY@;n!bjYCIxl@)2JF-}7D_~5_mTbi@myf`Ox98ZDrpZ9>+ zsef{(!hBme*((2dul8@PQ_RV3=|eW)d|o2KaO9v|zwj$QULGKW2V0Gzi{<$cyc`A>f;DFseT`^ z+fS}*qjvmWY-fY2A6c5vm{Nb4o*OX=k&C8pSRL+;;{h3Sr~*fq9!nzqYymc{!Ty#j z%pHy}<1w_T zKOW_p0{C_GV~p6{v~HWXkr$aqRK5QAWbv^;RJ`iu5Bktw4`)ic>o;Xvmtq9rN#M%S zA*V|x6W#x40mwEOaJ0*jXz zf5p6)zSO(ORK*XBQ$9Oh%WPyyqd@@d|Uw_`L1!6RcBE0yvsP_9UMwfB9Iq zh~X#5Y@ri=f|t>e2wQTZW$tTBkGC6)0&QldR{g=WRwBsqW@rlt$9vTBYmrM7Ujwg9)u zg@aTfA7m``7@)52LRNREPqJjyLAi!ZhZ!v_*6tW{p^k&=(@UcOctHX2P6 z+4aJ7>~#M+H0F-?AVKHM(PffAPkp!#6TordUKm}^`V2r-qD$3+Mp5Jc!)AO?3u+@k zf!cQKSp}5%1dW<~SO|hM-W)#TBA%4c zl66$2nsawr79<5xFV?!Zu+Cf;lEygIc4P60I$-O?%JRl;kf{Uplrn(yC4@)v-H7N% z{hTQ!#3!3bf9~?6{gF&J-oteNITNW56&xnW08j=Gs^Of1x1L=M9dar>{#*-TsxeO8 z3FQSqXF~PwuCDhej98iZ^fEf=o&r-lKm}HDMR2~;@=r%ykFtiuT`k6leM)7>kgc;v zuD15E`3Lr_fR*6xa_W-p+AHeoGlCb;UW7;6j0ObBYoY8g z;=!rwy)Ev?>jI=wK2L8tQX3aT?-JHuSBaqUlXGvQOQVNo=nHlG02M_DPni~^Y#gDM zJ!7ofRs@p18y7-1n&9PSR2uS!w9xxgbypt?H4_V<875}LkFb2Bhl_j9F2O>CrFLtr zf?LC&%(BQ4!-PHS&LJ}i1r>PUtW29bfu9*_OtNWqx&a?a<2V%3o{RuR zl4~I%yQ+5tFt0^rswy4um*ze$Thb(nl$%K!p&L3a`n+t|*DH!X`$qIit*o0IfY-e= 
z)NoZqeqAY-vyh%pYpl`yscjH1pq`e-&>B0y3Ei4IiqD^x|Jl9%2NR|`a@_?Oxe--K z0Nb^~{EAWjY!#D{GW1+qx({ECi^N!bp9NoQZDsup*Vn?~tqO~c_lU0e!cR*$}YvnLZ2uiLA`du@Rc=2zKF1~X`&OS$LM3O*Es5ntqAtJoPBBZ_4-9V z+V7_^4kk+(_(S)ijCBHv92n&w;a>W$b+ge#AdA8JUaCb2)yLQY{eNoaE-!7kC2+MA z??3?yB`pBEq}@4-qr!yANxf5gFeQi0?m$P~yEkeg0x{uz=Sofb@<&EeGK|%$l=t=c zp<4z(Wk>u75c)nGETjvS3hJ-lMo+pr%0<+VUrN3f6{{)1)xsAnJ@d~TQB~Y?cxNE! z*J`;J`Twx@mO*j7+q!R%;I6^l-3jh)!JWo~y9a{11b2cv1b3IJ%*Uv*b80DI8SX-A1}F12lHxcHfOR#F3;l&IpAp$cvpUeqq!E1xW^QBfE)Tjh2l zY|EI4k{qDsB@WN$EuDDPlVdR5AFo?Ihv?20mdg=-|O5qhaGKt{LO0Fhi-|b@mh=`W;=+wTB z6%G>zBwozs`SjMR=B`qGWO@-C11o=k{7b zy+D}1sp$JiQQZ&QTKie_WVO08{o_dApxlc8ACv!ofXTbo!5x~w0_t>9*Y8-lF07aj4;y2UYDm#J zl8%4Ab)`MEe+TgPWtV3O=<@+8r$evcdAHo1nkb+Cle*8}$&P2Zd(s{*kccyy# zxu|mUf1@k>npFQwx?dHwyaDJY5qKS0nO~iG7oBp4)+=)L{;Irr$ZsO=qUa)YyMz{| zdDPh)8Bmk2c)GeeHg)D*7pPBO;Sb1XFc`guO+H6FpfeQ7rA3+;HXq3KAw{Km%x)}+ z{?-y4UG|4@PG{}{KdLyFQ7(p4*Y?FVCK~!%X4vcJvp%>HQ`aU@`%iuAGv~ruw#Ch= zb4+7MZuqEeiGnNqoa1m^ogopvA&^+NS}SL~-cIsR>}*d^7E+^w>$e@gT}yamcm>Q? 
z0W%Mq5hd2q+jC}dsAz2|TimvklR~6JZ@_U8m&@;SExtQ#L5eZ+^X$&;B@K3qz4*@) zZd=+C(>Gx$od;2;JBgnIEbaAPci3Si?0(qmGI~8)e{){b-RHJF*|RWU=8>3H2h`e` zD%E(Z4tzpzC~Ijdoo5Du=d{FOau z(#;+E0n;BYciZtjWK^D%)GU9hDZrsrKh0|-bH-!)zr@e)y>E25wEp&_ZjWbh#dX&+ z#5C8e%Q=f}Hhq`+GB0%(z7f?S73^&aQ@E-8vD1W+&gI0x<~_JRnkGKQ;(=nywl=OU zMMbYHc=%;wYmCgHK9V>f#RH-{1pZ$szr2jHe5;KuT_(I19yk2k9zJlRTz_uk*F09N zV8*;C#sCu)T)!JTTD=wJnU7ly;6xWW%C}1xpfZOKtb4soaKA6S*XEX4lQn z%lVTUnuO=MA7?nb!Q+s`u4l+Lxo8rv2xI$cX>c;e`NCz&JH|BfYgrHg-uB5P)H~it zNUjd0)8m(vIsgL-kge+{YEHh}HR$F6n{M%uY}$T_vQ#<>Qa;j&=!_=;zObdmbY+EO zPxfA-XAn1Z>DA+%c$9jL1+C4?pU?_@B<%?r7r{sVLcm8+zahkRY4?+7Tu|Cp2-KmU zyi#D?Wk3$j`1~||2;7X9`gHeW%!oV_i$(082G=6ELeB%5{p|IcZke=b1V}2$YhJn> zboT<#$v?C==Vv;t{FRp-c;bgx3EWcVMAw30Qf>lNIWzBwdmR9Yh!R&8MUd7>*6oe+ z)l+$CYb4P$tI#5=0xmTz*nrPelLpdh%%;KWSNSS&ewh+FuIbk_LB^Q0QiG`>duGn+ zUkD5@aqZ}c5>Qhaap`qpzu{eV<%{Agir&BfF$|Rq$YzjgGnncK6&jqs#D?IrN6yoUCZ-qVgZJ8U<#ibD+NUy_h}!rW z!^8kE9#dgHo?wHq`7l3shi&|5q2Myd?@p5qHvKiIHmvY7=1ynbw9Mzi``*8VdJ~FN zla;ZdmAM;hH8$?X+jQG+s{BeQaL6_Y;t0K(I6^dGCn^0PO3dy@e#ikK#FpR^Z;AB> zwTphWF;r?rOjCq7WYOQW2gd>K@trP0NJ&n*4-kgEVdF#{3wYP?tx7kcARNb%L8dnk zAz)IG)%j?CYV0%fhu88wYR_D7TQto=1WE-iDm28G^;$%BZNiA=&t#fMi*l~~Kv`LU zN-tPI51B+mrb_e4Ly%g{_mn!BD)uT%;@zZZAs|%8civ*3c@*Jo9xb?Q6@D7tkU^L0 z1Ey^S&TLR_K^7mlrYJ5azb)jR*&Z7mI9e3%OI@x0O!mKTqejll5=N+MchYW`OvIg z1Ltrh!l&O`HHxhcey0YEY8s1InbMwOQZJw1s{^X|tJY?fCh+rXLM7d%iXLR-BI!(l zd;U(-=pVaKb*My?Z1+@6+eFUBX20kZ6sh*VM0ikFoI}le(A%d^1;smj^KcCeA*@Ui%`y^~QBMlu*ah4py++T@T6K_vF7O>)X26|s z_-fNB#t|Jqq<%XU_96*j7HSCZ2#enVgG8bC#lbA+P~K{ilbgY4;3*iIj`9HMolsqM z@b((5Ds!R^*;{see!J3zk-NOiD_)fvC>OM~mC|^M@`!MwlDY6(-;2{qss(l!Y}Ft? 
zD2D!+QI2vr)ZThvM){N5#e;w!TZD*OJb}Da^}bm7`mN9osRzD>)bMS)4%LXj%RDvn z@@WAb9Jt*G^zaw{5f-y${))FKn$es@w041jB#g>)Cm-={A-5l5eV7#BEP|i}KLeR( zQ(Jy^j3R*0E&h}5s?e7q1gTmF?w#6-(U7PeG~`^4Xe790(7Pv4#Wy#jW>5rrg?c(oUU_9|1ZB5-96>w7(z-9^7z}>6BSxixE1VP4~2M2VzRp^$@$iQ)L!P3l$z;ymj zhXPD{c#8S7D9B^(>eP<=Df`jJS`WS(>!`}vLm@kS1kqaWlDMk87wzLB0`SD?IDK=o zU^><|guH4Yq73fR3`P5*a7`5%Vbb}P*yvwnwDg>;G= z8&IE{8_FEVgcE5YPFJ@6v(5N*vQylQ1GE`z)FkqL`@)zLUF6{Z=32-hbTdY^9a}$8 zvm@qFa)SuLxUnuZJ@^X#(l!4=Nw%|kuE(HtbnT4FaA8U{gYu6qtN(@8RPLWN7_y*aL`B=3W;CnyJB$46tRB?MWvYd zM#KY5IyOX5p&>HX|8^EraOGoEVO;S}bL}+it|oc_iS^W5p=VQOGp%nmWW)Q%CZLx0 z8<@_D;Q(zNxM~Ub;3;*FcIkpXsNxMPoIw});q@Tkr6?8!EnT@J9FQ=`CvG)eLImROizzVDm*lRiR@7c4f?3NGa*N7eX;rgX znv2IgSftY@<9R?F0Nt3=K?Oa!^NnylVC8N2Tc6=N)DMerO2~}6Z`v_#x|8nA%ProJ)J$ywE4YJ1OzR^A* zC8u^|9rF=>R{7}Rjon?eHG^3-UfpfVaN<%MHzWZ*mdW)YLUoPOadR_GkE&gNJCX&! zXnNPct$^qs>WPnOi;KJPt?5q5WV!CS;Ic-pK1Yicl_(aSvFO^2Fy*AXb&vT>LXQF3 zAcl_Vg@pyFwU*YrLqa^RB~gBw>r7STlETWkC@R{vu4P;t_UDF)NC^1yo>P%6sW4BK zO=69_2kf6a3Yp04oj63ep&X@<@uxr*5Dw2$io3P~Lcx~b-VJCGX(H^-B`eOT}n>|Q98v;Wy{3gPiw?zaR#p7S&#q;OnX6Y|< zEbqeipW0#FJuPW!P@~A&V5^vJOL*~LBbO1?iv?$$2aV}u`pHi7y%C!}!rK%(4;wuV z0P;{!#kX+dsIyNbiDp;a-d-880pE&jyffnttCJsmWOn@}e-&GL)E3L#Xa4}yhb=P- z=$>3Ry#34MF|iFtTPW(IqRS$ObKy8ZO3SCIHSmaEG-}ps^4A;yM~z9Ud05iBE$as` z?mQ4u{Zjcy3{j4`l{Zov^DK#7xp8GI=XbVL7I+?v=saxB^b=eRUkkAN!L0z%G9}v) z2%s`vGK2%>8R%I^eS!t0P3UX+;lLPmjyd{k7}XLw|7&RxP;yG-bAoI;I#Il6*vv}5)7+G`LlF;f1t?}k z)1WnrHx@y)ikKHW_dRbU`Z5cfb3z}HMG2{#2&7*nCXo1+vwcEal;d6M_^LyYtrQNX zW?~=S-VT!AgHFjwh9(o&onmXDEc=(vUo-lR9gQJUPpZ&SxuL({_b$ymI(A67FHUL7 z58w9pSbDW&*OJ@DP9JgmjOy~7XCj@P*$K3hAm=8wV>!h??hcFtX-|qDDuU#IkgN&Q zzNb5#QW{x6)$OBHD*)n;a23d6{el2V(bHL4SrGU@5YaO(yV`T7--sx1z3;gy5F+NR z%;dKN7q@QJmWB~aA0hzECxwq@39OP^yCryN&VsgJ^;K-K!HnjlA>n;Lrp=s@A_T} zFA8E(O{76g>Z{Ecb3Z6+3;PJ0g(*490{x-`s+cV?R0?a-Tv!X{%*GV!EAfXyRvl+O z3#Wh)>_7%#NB0uq;E60aFy_R(u^{vEDb!J^|2WE*WymAb*0Qva7#OS-Cjhlp>3g15DZg~M@CBHpyYSw#cCxZ+Xt;PUgC zurwVZWl@xsBg2cV44tXssoMe(4sKi^({z&AA(2C0Jf-uYjaXg{!-!j=Ryk3z(t3 
z6As}+wPjE>3(`cF-dR!ltKr&AQIk_vHE?lp2DzWv`kZINSlrvoQ~6UM+@}p-vZs7Q zdv8R0?J9KabL?XQDT{`}yRCO^7ZsTit4bCH%JeD@sHl`M6cDHKuJ>UTw_P2!va@}g zd|{WOc>?C_8cqj9a#CdU-mvz74A;mfpxnW@=ple;$p2M#1LBuw$30HZnWVpFrV)>b zcn3HhIq6$(X!rrrZS%{|EzBLu>)71z%`%*~30}dd^&Jx>>!8d93~%CO~s1B?J9rwK#K z&0~JP7NwHD3M)=>;V|bnN)c0-2iDC>Kz(L-h{(x2_HWy&lb~3qWJ;$k%G)iNq^b*d z>4l$+@bvoiRx+A(XvvhfK$c=lpO(Q{Jm)r>pBg1s7SnN|(Y2Tss?H0u^7;x_;&Kj2 zm+IEahLQg7mdf>je}<~On+5op zU+Yz{{}N89Va8DN4Gr4;{#)~5hrX@1JYY*yN1>-EnkjyEJ z=K5~h5*Z4v`O|?Q4r1kXcbTP~jt9fOdCuIW>+f;-m0GHYMF*S{)IKW`6dl@+Kc6cU6)}M{EoPZ&;VgJ*^A6|{sELn`&JJ|8b1fo`xq+^muceSkDUw~< zKfwJ82w4x(R~`vlpPG&@l=AOu+61LnpOy@O_@T>8IGW5*fQ|p_%w)^RgWQaA-uevL zGTJ|{q~!;va6q@ycN)c$4mdP_$3^{V38{SV_x@5r6jnnN zTgze9{tsK^f!4%P0F0H=$$U-*Ud~&i@=lcn79&pUjT`z`^6e)5N)+%?Z6CE4c$E4@ zm{&6s|CQov&*d;y_e*U2a1y>j zc~y0RUBYM{{m;=wU2V9_Y^I5jSRu$L%zj+}n~z@d4H8BPXzKm|&D=laR4gie3L;F( z8*YXmBSrQ`=EMG*%tryhA}as}bFQ#J&M^dykoX?{q8G+=Te?Qke(&c3lT9k?^+u{A z^K$25fg;$BO#-z2X*rqw*-C+oa|GU;q(`B5KK4Y}9yh2}|0F(T%M^rqaPmVYnaiKO z1otAhtpA*=N3pI(l1<)+ENxopn&UtQAK{^ojURgpMM@R6)+4CupWCZe*%iX%)#{yd z0YZ=KJ(QXQfG2iGxoW!sMlb#|j2J~zzqTo43109wi7`=A324NRx0u!9> zk$TZVjQZz3vYTGIVgGMUXmD9*3@h7{O=zdLq#>6;6hH&ovFFjy{d^<*LoRU*TI*nc zaT95iWXtaG&KK5rHNF3Sv$f)XQTchcz3WrFLrvLrlF9EpAx8v$+mw>!17toeSxg*! 
zWrm+Gi5jy&H(ehe*G~&OZUsbwKFGeG zvu4$u$LNgDsj+16u3!Vu_oPH%h&p7BE%JFoF4s>iPW7>JoSZD5ge9+JaA|D11LPbT zT>yl`Ot?Ml;R00b^N-j+n439FafSw>uh_e`maM(lk7S^(q-#Zmc+k}i+`w*4<9L#* zxt_YcCuBNNXkdITescTVW{jdEK36*5B-^LCj|)T`igOdbv>6@iN}#FTxYIHIT4mP$ zL_O3$ab3gmVnBKdL?R6!W4_#xBWhKTFTyH>f+z%0wlIK<9@6w5zt&F$Xdw%p3~91} zi>4p*b2&h)HJ=+WE_CH+`X>++5zM(2@(L(($ulz0Z9CR7k5TsqTo7fH^6R)MBz03o z06x{xKcCR}KJsrMk9Xdy^7gQ7S*3< z9e<)3_n-CY1i8bg?Kw4KQa#>)yQTA*D@ac{DE~7+`8TN-(;D$lwo@@!*sK@s%pnss zCnYuJ^1LOq9j8IQu|zvSzKlG360r|^VNT=TI%0g~^0B$uwAU)_ zY`=U4#imKxbo)?NU#EgFfRO_x205+OxiVL-;yZ7DY1zOI(7>!S&!Xpp#DQa;%DYQj z{t=ae$aSUc|F{q~Zx_OILazvs!vPyFDB9=nprZPb|Rbq)YKCpyW@=Y9I) z00q8VQe(a#{m*-A7)R5iEeVkFy;5sd2QU)@p3TgIys8qi zLuvgBtlt^oY_G$izCB%$gXIwyPdFgd(+f;56CeMJIJfDIEbyxBDJAt7Lj`UwF}Zr^ zqsz76Tz<_5NM_OnIfIm}?@jXrk&^_cB@u|nb@F3|Lho-Rx_oJu5FtkGifd;6wJ5vc zvgxE76n^~Qgr>?@qSlk7yyLRRTBH z#uEVby0%A{B(2syn7T2mV`=mCb=m86o-?}M=IQ=BRv_XTnM1fLk_1Ig^q{}DrIEt% z7Wcgp9m=A8={KG|j@&C$hM$#>42t5q1A&yTQ%xxvBvh`jtVIiju@=>csBrMEKHpf% zeV&;gy*0{amk2A&+?b5xW-VR!vW_}#vaHNcwXCF1b@`srY1)c*L5rN`pTR0AAB4=~g90;f3NAI*u}=i@H+CQFTQ-cAdF&#ZPe!!VS}Q(4Q)|+~ z_><|}sGnU)QZS0Weq>mEGC^X;Jouh`rlAAAfI154f+-tF$@g^89ZTEbU~I}a#ZhSu zYTS+kc}+FYcKSl{!h##3YyR;JMX`?#AOQ#0bMXUe3~+J`5+EI8!%r3f!*#)fj9x_F zIZS*NILhb;$?jL(YK>%vkK@mM1Y%WqLin|9*}y{sY@oQ6W&;@f9L;%oAxi7eSA}K~ z{V=KBa@iyxQJ|wfZ!F*dsxl7Imoi|W^0U%L7Y-P}jeq%55rZm(e^q_S$3y)*O=(qcE=ko*|@XzfcJTND|48nhMsl4B*@%ep(i4p?6?~eE#zyLgPL)At; z15o!c&~q;WO7K)D?5_o2D-~0Ccm*ghu)c0$_sKE=Uj}`WlSfw}%=OlZG9)38M-cnc zIX+mQ@Hok>LjWiuCUd{yAUnL#{jrGyhMtHQxsaF4m-}}lat8-c`Fxn{s)hjSA_(rg z;m~)GNvMeU&~U(^g_>)}GQo6~5bW(sq1F7iM>uc$lEII>`tPx;r6 z26eQK#UYjC&Yxin?g@bWm@~qwDW9r-**fo4fW5Z5^|0X610qvCO_%D&5f|G`Qn)F7 z66rJn$};9|3gGEwvy~zl|Hf5*5bG(;lKq!kt0NU`3I4Tgz zqDw0D&oSVW5n$|DT6U0_m44$&$lG`EgTv6DR>OHnx4aF?b{WBd0e|$!hxu~boz&Hu zq7E&LsVxQ7LHPD(BPo5ztX2kIspJ)gbWtz^Alc?_$y*;6xzs(Z;m30BKb3DVO17-6 z!TLUP2L(7tRyI^gNaCF;82bNw3XVWUdkaHc)igl6Lvir7u|#_p?YQ0^tJUdF8}+0)wF)j9VJ69iyv6u}J`8{I;q#>kURRDdt9M;I^-_uMek 
z>RVWLt^3IPU>FI^0itc&@h_CPXls;!Y<;peBna_Ki1b!JTH%~t99KFIX)$UnkqU9V zn9TJhxlBC){qKhdO8Q&RK?P8I)^4j;(Ol_3uI5(_{Ri@H?~4@t$E$hb zs?W~dNGy^jzvI#+u)iG1NM;21AuOOF2zpMyEA>Ns3z&q|E3Klfay9p0WOIAfuF*-u zN%JWg?aN4j=khd0y*D}k9T@r51Su?GyJa13kYBJoV(Y=YdS`z*ywhMMN^kHi)acpd z>ow;OeL6KM?PgAn)#b5sr&ibZQoHvgtb((dZLtLM+Q~2x=(>(A`REa<+^WS(I%>&j zBV!hsw5NPy7c#p+dQ;fBQW8apr+tk3;k@HD$-8Lta6hE$`(dNM)6p@MdWL3XFz$>} zxb3^!=29o}>3$~+m4>0P!$|np&^xt)2wN`|nY_WE@VQn25^;Msy=F|;?Kn^O(>Hc# z3m%P>q(8?B(Q6v=tWihmex}7LrG2U?89B1Ht4xT9n`Q5a-!(vXg{?V*sh&gazpE2T zrlp6ckA7`)b|xh}+wstP7goPj6xK(DjAUu_yn3Tc#P!Fx{-ux^L)s*12!Cp_44)`A zf9hdXlT}hIk-Ac`Kh6NHx?Ui0k3kE*@`IV@Q78x>c_z|-Ak@7tONb5 zBX*Y8@k@Koajn98Kht388Ce>xG%9=?o4(Ld9?{B=9sBgU-R^R}uhuVPR3tb0-d-M2 zzpL2P*TDZL-b`Ta#R|o4ee_~|Dfm%^^Ukk1GOk5nb17AsL z>hsxS)2NDCUmHz@026vUwQ9_abM>swVR?#n4pOWXV{+ej)n3@{_;*&+G@HZ++DBrs zCA`5rYR2WeECI6^sUS$EmiBBsxPztOr4*5Os)oY^^mwAlv>j-gc5o2Gs!0M!C9EPJ z^SoE?S*0l}ea0yg%WX)5s9N9%wkaqTwKdlm5x1>BwpGx8Igjd%2^eUoUFr= z2;5oJ@bOq|j6m|8>Om9ylhHf9QfF<@bC&F(lhjmh9N!P)IDw@Pc4^(EoxESXG_5k| z!sXk^J649Mn0Da5Qw_0_;@phR4@2aXV77uYm{kpmq}w1RwW}5G)hVEsHkM zobusrVcg;F9!~kvrcRDJcH<>>Qx4Ja0I+X@PqQ#54incnsK0Q|rJKq)-2$a!%0tBs zAy>X;S|MVBqR~Wjy5Ud7I*_^bbSgZ445SqYES(`v7RVI6pBow}fgJKDEs>(CUoEH> zGqJwUq&Ai0U__PqsEMw;rflO?`6KXaoYr+MsB;^T|L^rdxha zCJ~#8V%eVhF^4i+oCy!jowbvkVWJ#O4HxDV_Y@fCw7nSnT;(nJHhk*Jy|jEB zX@o{&Q837uy^)am_;M7|V#Mpo!Rtv=c>JwuCLrcSvP+(t&oeCyXXe$frb289<FGB5q?mT?HGy=daYCR-R?si*<= zPP>Y3xyE{*@d}>v#rY+DBF6>(~n&6602mDwI0 zMEG8*)Haq@kI%6V!Fquvij+(&Lb);I^S1vi|clO z?fg+#f_jCNeoU&mma?1K#p98*S&8~waj%Nu)*(mhg&n%Lw}XgiLUS3HlH<@Qnd{WdPYkf}p|_Ei9+`x|9$8 zGdt|S8u0`V2BE+RIky73=8+qEwUr4|sHRtIhBT0(mXFxkQtt}6oQW6e?ac6M{OjBI zs#k9!#W-PPqR5}8l@U9_R=*prOiT?i1Zw<*tV%;e${Nvh#Uc<)qQ8G%q>Rbtt^FeA zWBY^Ad-QzTBTuA;IK@<4993 zr4lFX}*QSBdk{E0EvWnblt(1w78j zO)J0n4x`ve$ZUoocB{_IMkDi-k zD;Q>iH^KGj;^1}Q+EEg{aREJmu#=96tTXe_- ziS>H~XL#h_slkitlG4aWx>EfZvq{4WNP{h()miCcxF0jo zJ$C%ngiF1y%KM!P-7$FmXmPEtKC+E{@rnb40OnPiy^-AW%Q`k>R4<;H)qUqym({^d 
zkMOYW(?l?6i9)QV3|v+<35G0evj5_ArE6iF(*OQ_*Pn$0xx@!UmP@ynM?D5?^>V>+ z$K*O3&t8v2+5tQLJw~dU@#~V)J@k9rAgN!KY3THG1cOk11zX4b;yOyZ3Bkf3i{k;| zebmpwr(FF{R}gV>l!n#P~4Ktp4s?g)g z%{*v2>nKxbhCgZs&->!2qwpYS-<}r-Fz{6G>aoxvrrz|f8M4Gnnatq=RzNrV_cX|) zwWImkHkUb-R!Hb_;tes(gX?heOpSGG6RDwZP^v0@#D^hIS8M79Un>w=j)hGZgx!K63*0U8-M zxV<-7h3F?#KG^IKXAme&5$}-1f_=HUW3*I>G4h)#6YG(M0*Z5~0aexs zz<6z#fW`8+PoVm=?zna53bQZ6X3H#Mw~iIuIc*n9!I)Sz!k~ku`Uo^m{cpG~*)p}V z_`d*V=|m9Pe1p#1@r>T=(&|E%tFHwuPL3&u`{h0@BjDPV;k%ydt9o3W=~PfxDZ@Im z+y)*bR%+g1Qe0a7^2=?;xgiF48QF}@aMJS`E~jnD(8$H%;2~1%>bo!>#zLUV^ zRn-e0-N`5>S5ds^t(uReYBv}J40}PiOKUYdmy3sMvXYb+G)EXh)JM&M5#!b#yNp6t83ce616P*4mlb``BbLiRCA+(kLdF*?9 z@0f+YHfh}{sp=ks{%+l~5p=}7(a%(mZ714h3uL!>{W!r6^6&wx)RYHFzbgun19|+V zbz~)ekva4sO0tWhj5>LVjkqEh?ppEd1jB`R*RWDHowO1jFl$&dl^FaZw@$q0I;jRR zzHv%heqJ<^Dt4(Zi2|{&DK4bjAbE;Sqk)k>>GvG7u`z}uo~CwSJ>oe+IUr zkbSH|)Evu}>B3*?IGrefg8-L(iEteLjx?daTl6xSg7wuFXIjJWt%F{6L(etC0_teZ zKmJVC@6^6*Qes8s19dcHk_C1!Z58f?a=}TBpE4$WRX8dh(6YrUbVqUm+f3BKE^Mg5 zmqLyg*&L$kl`=C`jWT_)LN6>%LWTT4K5PxMHmq+VhMcAB#kSS{=(O~qk#jkasq};x z=*L&Pqq8l5J!y6L-hKQa?@QbLCl9(EDC_s$Cs7}^!nDTcIZ<4g2|ht#0KGSq;{VY7 zXJToBVZwD6f*w63lUGh^(K%{;-!1g`V}CJ(DBgnSss@q2DGY`JktA>D8MKVaS%(Zv zCV4MFm76J1fRJnq+MaH_C5)8lidEvn~I zAbw5VKBoV*J?J|~O^FTb;n5q~7|0}FBH|5jKj?@EMwuKK6k=fD>rG)3!b}#*NntXt z+TbMKfC3~bF6x}f$Tz3332+J>U@^jlfizk_@plf)Rh2fP16@g+zPO{b!ik`Vdc=1B zE=!2w;y6S=CR_XyAXpBx1DjM(<&bJ0?EMAhe7i0}zJqpgP=xULAU@}`Lt4XD94Rh4 zDDBpT1~u?Os(;oadWt4Pf?6NA@`P!a=w;pD@S_EDV{Shc22J*&aj*ox+afkbGF-y1 z3bMGZGQltIiQ`IX0PYZr+peZdVqb1|!mA!vh6OavnD5P<*~6(h@XJ9cnueF|p$`$< zA4tu=x0ZoMnn@63-)GBc@_C>7h=4NjL{z(I0qL&XGg!Q}3k$2gCtp4rF5cE~k{YHw zNx#IJe|@2>fj$^~l*E^NFhlkOCyce>!bj^=sM4xk#m6bseaR&*SuVJ#g zvcp9ePsHt+CY_8RyQB~o z{Shh2{VeHlyLOoQLQS(Y%NECUy69YU@jd9g!=1gSfT(2M^4jeh;cI&i%8!<7ds0ja zd1vuQN{y3JAiw=Ji*%zeNnO9}*akECVq1ezf{m_aa(3!zP<-o1ik;)!UY2xWjVT_x z6(%P5t5QIW4_Ne*pD(%^q@k4s-#(b4z`t1lObU|xBfIDL&SJN_cnAJc`@;$20Tg&h zuU96)C&Ri3*IMO$HL95?^C{`f-CzWw&q>+Y1ocxYIeWxRKYjVH%!3H?vFL 
z#k31!md{-gvFa`1TBK86nw&;N`*2YUWAv-)l1gtTu_~5C^W9j8ms-xncdz;F$t$vp z)x#5a=bbfmaTMC!pmc}R@3$CGr5u~D3M3m{z@q5j#OUK^4DJtjHl??8Ee&4Vf61ag zf+NvXv=v2ZFuPNINhCgA#Y-}tEPLAM3;&g-;6iwk1qB9O3e#z}J-O-i8hiTZ`_%bh zwqpoW0bM%VPB{H9-vW-d6^VI*NpVprcrKPYXp9TAf$ftzkG=b@|-X64o!j z-+>#VY0f4(?~sPs`YSaHj;`V>N+#qu8<$df>cr^>@B7XWc=#AnF!1x2ugqi#U4?`p zxg*H8NOaL4!Ij1H_g?DA=$W{x5aM!OmJButNXe`pt;6|)$ZF`|^YnUeXC>7Nh7%nU zt^J7;I+_M!o1-f3bFzjEW{?(C3w(lJ`6aVI>a2K<^`Q%EHkT-b?>mGa z!<@!jW1v>aCNgMy<{EqX-4cr2g_V9Hp@Ch{(154lRvU*_cm2GA=8u2n%t8&M+Fl7+ zqLra}CE-D*AM3QB zH^+mVRrOCN?a*4gXT6eh#2aC_`-Gh^z2M38SXlxMY9~LpNv3Va=;MB+rl}f^KW{RG zqGPmjsDNWVnEn1s3jS+sv2Ze-Wfn%=v@)Iw|6FMukHf}CaK^>04GuMNn!@A`8J7b# zO}Op^iXN#icOl9&>t9z(NBGY;aryYc8B0Xw&eF9}{M&+(k1tw4i7@?0fp|!H_klU*s?8C$%9+G6O zy*Ag<+4bP26!aAVxrArz+svEd)9(%m74Lwat69DG*M9o71l}SCVV9vhc!1YNjAJv< z@uc5l3=|aR&;Ar7D%@VV{w`hq8)W-kw+FY^=Q75C_3{_L4 zp8abO`Kbdw_UkL5_PoErt_m6apVvi#xn{amCGiCv>4EJ7$?%Yom_n+r3&pBI!$j~0 z-1D#XV$kNnd@@eDPY+naLW-`Z>hjU0M{GIRC}bN2jipvG^d9 z4D7JD9ICEPKu^yOUOA#@*P$2cgzU(WIogYNq_YJDr$b#Reu;DHn6tA&!V5Z?ZDNHz zEuO(eud{ZezZM@UyjYe5$tu(zF;}`4SK}BM+Rd;u6BFh5e4KLVO}FV+BGb+2%JAa9 zJZye1xF;c9$xEix7_<25C(5NnU@47hXvg72vh~+uzd*PRl`KW z`z%?SCaj~(+=S;5xwjHm*go9M@50xP2c6t+7u=cEEwM#lDeshsewRs4Zhc05fU8h& z1Ik?1Q6i^B2nl$ICS^ojWPq3K4#{+JC8y7a?>_mzIHPuUXNCl#Lox>|nsuaOMjm2c{_TBVtaas=emJdKg24{xod1b1!|> z@a@ac^^;YQwMzZ*ht&+6Us9znL+)RWHwhuJWZCVFBhb}K3;66lX_zFhw|_15N_wSN z(2q-oNoO`GYr05ptD zSP@p&QesuU@K=Z9B;9m8neO;Zume@7eN`L<6Y=DT+o;{=h4%!0Z_{Gm#c7%OiO+=={g7g!x4e0%qfLTY-XG_jI~KyM$?U%%D=PVLTU5p* zUTZoyr$JMxxVrwdAD&^>C9bk9dq^$LN#!_s`#5c2xBeCwG6pn%Eq_`r>{`8}z_Xx1i zA+7>`i{wQHYL$IqChK{B2E)s{)JH%;!fC0}gYm=Uj0q|33=JT0ylHNQYskRssZxDc z8rVO*e@X>8*GwYzzMK@OU`ow3>)@>oC+XytAWNAKu&|;418tJ7Dhv)duqp^angS#% zh41}h>rcBsz)a2|1Pbb#G(4u<0)1%rnG*|e)dG7<8P z0IXGCKau&j%_dG!fkrb%ZBTaVzZwa*K{D};R^$fAwf;IDn9>Om)<4%kCkcrqdO`YU z#b!~sWyf|AAMWlgrEC85a>`xl9~1YjU-?%<^Y|bTa?Sv#^V%YT;H+;UUH|hbd_9gC-U6*QI+*<2lAVi(6@B;{)u@T+Q{&-3pA@ zU*C^^tnC@2Mo0AB9c)d8^`r(AX$eR${`RpWzTKLD8#J^ym8ePol8OK9po2Q6t)}aC 
z@$-1kP*iQm$|)UDZE}~yvQ`xM#*EG~Gr|{aEOS5kE&dA=jFjc&rWPWRlT|TNjo?jJ zm*vwElT)gyUz2xbJW|CmFa2#O+@9>VkGnN2R_#*95(3j+3kVU)x|>GrdAA%r7`3RP ze_Hrj>B6t$9XBO@pf%xsDApK6YXGYqPNgk6p&tR4ZM6%Vx=__voWuq{s$1a^_0ZVy z;U9_bB8R1^Em~V6I(*qVuckizlDhL?=4NT6N4@)?YTD+*h1&Y&$uZ8YF`A>r<`k6v zWT|GHOA_2fLm!V;=-cijjHED1t=HDW;3`6|#(Q%W+tu|tFeAKg;;xR=6n-4;%h9{K zt@3^No^#f;QwLLLHnLeO=fBPFcYS)UNgnGVGxKM1T8@;p;pvx@@8Q${zMvkdw7;VMbuH~lX?E-5U z)K3G3r3GMp{&>`oeKBDn!YwPbl5=fk*}J2Zirn zl_rC4b}|#c-C!w$&45Bp4I={QL|WA67dQU|l0cURE_lU|w{q5ztV!q4c@+4O39lPT z>Z4wH#N+y{Lu%0>{~z|=Yi~E=aOm3~6h#ysVk;V>pSLTw#sJYuRzm(@`&5qU)}oqG zJ;J~&dB>MJ{JI{23U%W|TcEYLzvE}=OksDtE9pcyLr8xZ1W4I*-Oa2QowFx zd|dfma#FB-pIh2Ig#T0QlWAcAZEY71&A2zas=IMH+<-O7hEU?%?ZsL1PS8s9F04!p z=M9O7X*AzRmX;cmL3{uC`f$(Pl26qka?h@Psg9UdB?(P~S!GlAP zpuyc8g1ftGf&}*raGHEsYkh0g-c{$t0n!$p%q2KKURaSR6S^U6zrpHd?zjRY;ElOpz z$RjDGaEDu*U%V537nNf9$gd)q9sJNHM_n)PoQg4sJlQwlxQuotVEyST)Y0U@jxp(t zEIwit&@1p&9zW*sX%M(lFsp3-a7?(T)YjHCv)nV$;()giPG6pVTGXjb0R0{!y@nuGChi}Cdd_@;-YxszQHq41`WNAL67)5a?q=XVNVY{(s)O>2&_y?te`AKe)Dqo^!2DawX6ycM9!EBzG zueLr#5+4?UgrXi6YNi?7nhIVKrB2GrAM?adSG?|ZJaeP2C8?2S2Zwcf+7*c0KUDXbPOQsy#Umwz8t&In6<3x0Yh7aXIVr z$L5s>C$&g;zlyP-(rv!qvzLf9v5G0T87Dg*e$6ZAzFzGVshuJS+*4)KU-n>`567(P zlc2S(o!rT_{b<1*l&?n>?poc+u3CsuO+`)Ik;oxLgAqo@l;34R6TSQ~fWNqrE2{=- zI6tv?B^KY32zW3I4ouv%oXfQn$V9<#7OC?4Fbwos+53<6?DX1C*MxZSSxVYYDq%w> z!LhA|t&G>42W6ZYxE_kvIfPh`h$SMg-Ld0OZzXV^m7}zwTvF6%O@Ukax{PVQBsg`9 zcoI^EsnzvlEBHv_2Y$6#d{Xe$ogNfRw<9>Hca|w2VyuRB@pNm@Qd4GJx=W{_yBM!A zS^RwT;f=9;e-N!>DD`J$Nvw?HU7irgLk0LMX6>9P-;Wbflvv8ix;;zh$W(CF=4@#O zDjE>jfodaGbf2>GL?D+EgaJy? 
z_7+jSSI6aTlxfg)-g|mYz*lUUDs35|ANfym0lt;=y`~tFR%}W3d+NSC=U$_~Y|xAj(mZh&SEH1*qV4AOBFx?vT8 z={{B6ie{o#bGam^F~@T0;7QO}(>EgxP#W>YqCkLiJLzKo>YOSf)P+HIsE8GOThhMNs_PyO>{0-n4V~WZPV50Qk*ZXjpw4KHsn*fuqt<1e6K0hV4L49yT6*B zY0`d~H+4JNSS=>xbBFJHJJ8}g#{Tg`1z_dN1*Dxe$pErt4isZkf+2DU1Edh3+H3ob zuUlGkl|~AH-wL^=-gcjpT0E2fY=xjbi7gz4% z!j}Ergo8cNPmutU`yu2hUF85$uJ@uZkvr!StQ6dq%nv2U`kF3IM9&E46 zD|`A@Ezi|l4>u*~2{w7lOUx+^R;*dUv*6PZ+H5IKuJ&5fyXH&Da0e;AmIki^;icJD z`r+Q&AJ|X|Hbv^!`ekE^s5eh#!X1k%Sylk72POm*RM)XilEfU)-1w!oE=YQCg2jtG zC2vEYpV`LjKGvZ;(bD_aMpU)}qR}esx%EkXDo*fCoK5EiML5}7x)^})OzTLu#M4CI zo1d5#g&bgMCLjpL5(vycO=J%Z-de(wX|hwDr-n}7-jJpCIvetMjBruJ98c$F;wu8U1dP!ZvQo|qA*wm zt=JpYzK^fSPu!Bo4G9viKH>afvVC*Eg67xk=ER(AP98;b5(a=?)?L|ZfZfLncN_jm zx}(hyqV#v+Xw+*fW{q#ZSOVpzaeYg|Q>FR&`kr&2B0#-K ztI!=QuZdz>Iu^OI7L}WA?>U&*yz2icO$rTl`1bQJ){Ix-3_&Y%-DjX7lkOdsA>E#} z3$R0sKnbhB(-4YlNvF09Lw1rt4a)jk<{16Yx_9zL9VOCbvT08rB~;vZ`jJ|w9iq9R z%D1v+Yl_ilqXtoTHZ=7Iqt>T6r*mDzZ>I-H;h{Fg?_D)Ky!k<$$1k(n1A-xkE zypdTc$CCE?HI7ts=n3X~8{fpOK4N+URg%|w`TbNzyVUMyh-zVNLk5wB9A{RD-Zviz z@e2JuJPC@D5MGF?*gQGaRoXR9stv7??yNmty zzTFjXPK*g(X?8yp;_yip3YyJFP0jbA&(tcy`&=l1A46zX zxz)fFZI>`^MTPfLcNLWf9uXc7ugPj}T$J=)mvwtXbo|{-Lfft7Qq(r~_s{*ub zE$+(O#R6s885Bs*Jz2f5ylOV_AEssvlGIl{>hh z@vmlphQQ@pS&b($@wNoFB~;03!czZA7<5bG`6+v*OcG~(i@*(B4t z$YTSyX(C3Gf&fxJcgJL1cRND;e0unPm2Cbr9pr-c_CB#&s(e(w8d`X}_Z{l*LfR!X z=^jp8tCw;{-=~qX_EAkwyhCB%Aqc|aPzyxWCr1+qoc$DJs{7H?M4nB~w@%VH=gu4~ z6NFgWU5R5Q&&U(3A*!@wJ7BewHjdB38G-p-v{Q|!)+>oq(=#D@Nx?*Y2r(*51&+Z) z{h)w-uo)SHB{WF6SkcPT4OJfMyUT82&;_MyU<7a@39`p6nZz0rp9B%IqS)?6wLEx9fjIP3`nzFCRB6 zN=cjRn2n!%LVlNbnFySi(F~ftYakbK)0-*nvu!@8(e11*tP*n=QQGmRZoIS+KFtn! 
zo%pRUk%Y;K8c(y>9V%Fvk!h0rcL{zH#yP}Sk0~eLVNbN&knrWYvy=av$&vQpLZg>t zA!8)oSK1LM>v!~$P8gYy@z^HS5jp0kIZ>9+`0d{9rJNGq41VCgp6bFLL1X#E!WtZ`8d*J zsarE6`}>K3?lwNeWp6fO@tcQR|M;^Xr)^K4ihh=ngyt?`Y?53&pbM%@PwdLx-GQ%e zT2$-WG`wnMjYGiiqCivRS!CH9UMkDcvUw@mVP1TXC(ux2n3^Wtq0JGMCFYvG`LXp8 z6W4-LJuIPY%1-e(Ryr>kaqZt81aEgwe0@G!8_&mZqbcWmdwF?3#UEoUHW1f<&E*y_}ybGHx5^W92>y#e)@*XHdZJyU-*y5aM%)R3H_6 zVpPTK>R^3^VhON;n)sL^9CKlgi?nwOHUg4W+%KR}+g+l+L-v*7esd%5k-RPO?9F^* zT)rJ{@&FJ+OFoc%;w$U;ax!nDi|Oq5jVquY7GwxBA^Kx#5#50VZhQ^49O)$>)hG!p zUi$^m+kW7sMFzq*z41S!UjXnL`O}7k3IJ=rPHZ4@01*{@R3EEw2xxT`i_?(x=U`6@ zTw5)P%^8NRK?6+jUm=jdU8*IEVHLnw5DZ!V9u>$W!j>@-Bcy(KWVlJg(k_<9}cFLOqE76~yWnc`?JgY5U407~r^>Ma0lyfar!QP{;pH zhCFX51_e-aLqomg(C9!WTL`WcwfFR0m)3+{FoCnMN^hwMfr_en6V6`~vi`PP5lIuF zm~{i%KLtoMqY8HoaXh;auoGh52n!h6^Iz(>vxY3u5p6vUUX}vJ$1Mz$2n0L??u>Z(3 z@8xxQ+Hyg5JWM%}{?GRKA%6@LV2{JDbc_N>85#$m+|PEGzrB;@F)RSMk!5LBa3z&s zIR7kp03k=0owT#}M;w2rNVYW|SJ%G>a}WW02v{)j@5GXyuEOa+ofvDsf>yI2!@u?V zqZXli9M>O_>Nz#opuqS%=iYxup9;6AmfGzi{$>{W`hMxYO#)U`Ok`q^_>f94a{u)q z40J)ktxyDXZV{J!0%l|ay3b!K&+#Ye@J@FU8-&&fOII}2>=2WULqO!&gPD7*Ig+lj z16+`*HNu)SsJAa#oCOLVdxVIF1XnfM^hX?s+{rrLJ?e<@8&LB_50~O6^rU@*z)kZq z;(gaHQHBO1AejZz1TZ5)nM97wX%;7+NY>n6ZGkHh;=N%2*`AWrEp@;2TimDQ z-C%$wT6Ag@8to?!My!f)I{ctsJY%b6Bwd+{+q7mEeW(!4T)cCJEemg2kb3kSruo#D zcDxg{C@)56DfjjR;l970T|=M(29=w+NAxh3WK}zV3F_OSx+%{pY$nk%1&R$kwWEh) zhlnbsN(FrdD&yRU-EGV&AcbncqLUMdq* z{?_oPPpTYs{0dD>4F69Bxb@&uzzN07Ug2p`!91+{*(Ah#w_A^p;~vvb9yX$qQv#rW zjM%DOcVoSkF@ufU)SvI(3aq8>QVfC>YT^spJ>|-R-T>ucXVUgA8*{7{_L~*Wj5nMs z`}L)0KjG^E(_8pPyOcx-FKSf}xA*>VF?(6E2iZS%J&?XHx;7_D)1V{G!XrS#%0qA% zQ9$^FkH@ZEW;d%SI z&R0yCFI2DQr;+AwRs-3H%S;x$biy%pPaUIfNI%8M{7fPAX z+Zzcf6v=>~5wD*@)HkbLZ3f;`Vn}Hx+?rT@_GD7;Vlwb6pKj-d%pz`3(C%kV|4gnz zW{i!G{kDqlA^gMvgXOy8gu}v7Mj;hCNf7b~;VUD$6bA6AT5SqsTRxk$pD1bw^WuS7 zeFShKnFClLjaANv`m6rufTcm2L>kX&HK#dQfPXtuOiRQHY#PwDQo%Rmm`=1ALdDOH zHRpv7x3e-=_S_HpIWmi5STpiz_Te+KJI-cMA*h*zvDQc>On1gi*H0tF?CK=2c1+GUX;%yqO{YpBCEy2H>4z3 
z$u}sUYw~ARyDZHru~-%Boqbc)vUTyuUN{^%*=3mAZqAdya~6aP!H)h6Z&^PDj+FnT z=U&FvJoKcUeq2DtcO_#tu~q?;JiLBG#cn;-Y!s9rtUvswgzB8C%mU#xTL7SjYl4pt z-kHOwBVw*I>w}`PE(Ox!&Civy)C^OdDtA)p_nS679qFm9={(IMfbGNUz@)PoklMrV z5-FPGH}`M9Th9?OP8T+1fPmo8uXX%t+W{}O!LDk-K_qy*dOdRB7T+oeB`RHamI0vv zACq+W`U^hAYej+axJ1&oE9nZwgx1>7Y{|Z&RrTZ>BG}?B!4U6;xpt8}u}HMzuFK{n z9Up2^Ts5MI*fPzTw!@72z3Lo#m9lB;v#7+-UcfizhA+2dS_Z25tKfxG_9gA;1p|R5 zHc#smV0(hK@^O?@cl04ppuIoQMsv$-uf4os?5gksK zI^3Rvch1rk5r_UVY?IXytVv|?*K>NoPUV@Vm|D}m^QT;Ow8Ood8K;VQu?L$03dOTX z?ecQup=N5bkYAXdvNyC^Mc4@h6*nI7F9X@2m?y8&R=9}PacY$XN{*^Nt-`(=L%Jq) zlg8OhV%0#&z2+T9jTVLh@lYA3z~c;Y;Bhl(9qu(a90feH!usK++COX+=-${;;a>fa zwA&Co&n!K@2|dS98?ZfXX1&k8%!$^t-!s2Tw5+G$pLa}dU3dozkk5gAonoad=@w+x;efQovsAW*WQP^@NN)80<-QlLt1|e+J7X14+6^2?Y|5`8+vjjL0K=E zC|?+$>J@uXRuVHZZ{hxW10*C5P{lTjBAEVLtQ-jnkqDjx!Mg(3W*I*n<|(A56$t{dF5 z(fi{zJaXJj$dnqsKYvhTw(J9Dkwx%!V|iw*k7EVosK!M%ZqCk>g_V!x@VV=LM1nHv zow5Cuf`4s^r&?0P*(6|`Ehv{AwP|O3AwGW6@%O7~5}j3k#OPmDU62@3tH+_f-+ueB z^G|D7M&<;N}I7?-iP3* zHUP@VnlE$HGgMmna9;-|`t2ZBTI65A^Y(Wo%3W?Ho`G@qiM1rT z7IYrnDy?0z$>DqX&wdQFm4%b44k5mq{L!I4YWnQzDI3^)I`JPaAQznOz;E|YR75wW zM&J@H1iNHJ6p%d9!OymR;Mq=B?a1at;SU9>BH6n~i}lmX`S^z=R!ej0w^QhPAcR@v z%#-F6E!v-IGg~?wKJ-trXxGQc(lxi3`_G?Y~XQDC}1vfmG_C!>oG@qXKp7! 
zqAxUETjN`l<8%k#=WPUD`m)I;3^~z?)li9C;wl*5{UP*7X zZ*+4&lOcLJM&t(!PHHXQ*jm#9)jGry2c&1KrdlQi;8q$a@6p&~GA*4@);a*tpjT83 zLh>O+yU0$?Z-V2_UKDMJq3O)~vl*a~mtuZlu;lW336=I0^pVkZI0TXb&Zj5q3?4}$wejU&ahCSBvCdmOFd08>ea%tc_bJIyW}!uR zrBkHN5`2OHg{EnuM$1*#thQF78Zx`=cy(q>^$XJdp&~;qPUcLKIT0;d}sfpp)VShAWKb?r;@5fB^ zyd=O$P4S)=$q&ii%SgTg58Qotr~8yreT>=0Nhe%R2MUh8Jw?q|ZhyM>t$|QE?0#Z= zX1(Ee-Z4{-eqSjqMq-IOIRi~UB6;~g2`*xemluOR+sE1Gr@^dxPj4!(~g_{D9md=gVZ1pmG$Z0G=5i%Sh7Mkt-o=C*#BtP9~^7L_lI;*RW{tXmp*hq2p zPEKB(n>t-qcP821q~J5Ga2c#8^v(eICxx|{&jEvLId@|%ywo`Gp~Hks@Y5j9&n1y7 zDPDq*%5R-W;dQMPGt2j*kB1Hy#+f?vxYl5AA3L?m;^cgLAKud+^+L2zoxA;wtDa>F zB}Z+jI19Q%7mjZVC%#mIiw?2YtS zg6XD7KC4I)iYniY4#`rNQ>PH7s4$HV?;lLg?5MuB8sS;c8P)%ItwS^5N-MjL(zi~M zoci?(l{X#1HbypyR$!N+G&+1-rxgOFQAbTDL5waXU18ssXIXtA)JUR-ryqcq*@&=L zxtUCV!OJF;Kk(8SmaG20f;sN}y@@XN{{7RwH02BN7%%^k8|R>#N7c06r&p4lFgi%x zd0jJ!k;leYv~A=-{Gjo%Vs>AyrgJGpHb40q(Be&CQcW0cG;w{AuC_mMLF+0>0Dqt+#~v=CI#@t0E}^A>rWMM z?s?*?kYn}97-LR^vGg-OJ&X#k4SFV-N>QmlZ&0*(ae)?zVQ%L=6(QHI$nPvfVQoZZ zvqo$D)+3$mS&Aq>kSf4p)>vuJSkKEV)FweKqq?{D*-kc z$unYzU_#89sehYmy!scNgdGAT<*E&!2PUBNrea zSXEqFB=)=%C2i8Bc9kryi~;5GdP)SFG~8xUkdReILX`{>#?<~EMdW5Ldhqf4gazpN z+b!rmmY$M7=PC@YDQc(>h^ZWUSJpoZx}MhKYb?`2Qy{B>Ur_}3#x^4En9J7=!pmv5IPXS z705un3Crs6BN!lh!6@w6B*0=H>s6_Eg~T4V=!W7B`Q|JDcNcC3{Al8C^?n<=?|~ z&|zI^>__Z~>7jYOVhKGIg}iO;E|iXQnHDdP|0uo6YDc*%&DyhFJk)z;b3L$rPT+Jg zy*BPq8=`bVNSgX|fd6$cg2(FuZ(uKJ$-;jl%iz%C8(1e6ET|a+T!P%FF4GT((*VHY z_q=6yFVe|b30if&88d|VHt&hoDSUvSJX;iF{Z$Y-kn`C2+N z#_c#q!#E2=5+f9lGf$pqIX~b1_H~sU-cJ+CzI8vhfFj886AJU<^y_jHp#S0$7rB@A zO(}T1jePUu_69Iq2inNN)GtyfJJMgvB&bV~_zJqm#M`od#fBQ&VfG_9Z2O#%Hb{xg zTrcR^D$qy8Uh-a3m=BmU;_yX)`s{FZyXp=3MZ)R94@+uvd>ucUxwKF67u)x$3xOA8oGX(${(|pMF%)ye9yN-EIambm(Jc7#s$VKxi zLLO(RmC8 z$qDekMhBPxYtl#4IV;zy=1aDI#pHpw;g+9ZQ@f4+;v%Pab-Z?{PztJiz7cXSOL3ca z-fW8O%XuDGpF=oO0|px{deRKX>oU2>-ajLo2D@$C&$Tz}h@en-zL(RDWAVIf@$ljm z{4$1*q8q!cE}o`%Ja*QMV=(NHY(5zIz9>>QlG;RtzuEv#W>3G6B3M0<^5)L|de+zq z61yt*ONHyYhfkiLjXKiz@#ij7p?=bX+LU?wwx^V>|4@ILW_R>pSi 
z%D~q*#zz~Q=TV_0M-*~wg!>sPeXktdzb5~x)x>Q`w)-BTmM^A)R)ZH1V=E3|Rr4&8 zc&OoztDD=C$JyTqzcpPTGP=^g7a($|8aYqXe#oom%h<8|gdg2-^C7`9X6f-pqiH93 zNzc4B#9>7W8n_VL&8dpS!sFCP9GYQ8$vbNv8kFXB_@%-3g~ey1BHYW~mftV(loK_X zbaydiTtofKP3DxB?+~l#RZvjfxrJ{QLp57QZ%oY!($(vW6c>KUr z7FhN#vY_r*JmH5am!P!VD7$e5p(6Q!MK`S&)?TCn#M0)?LHY8Zx1+-=yAs!)@)axf zV(CCHm7%!NE&(RcRJV(p1vm3~TuYsY`KV)+9&@d-K}4mMKw=h|BGS_C%O;r#q6aN1 zmlxZjE~sb@l6EB#`MlPGS~?vWpTcHWd0L7T&Ph53=*^$fzAUTyGOaV2RdcyrTQC80 zG{R+3)NQ}}+1ZuJ&Ls}e*UKoxUga{tcv4kW5vON|JohIXgvr7@WkrzT_xRgH4U2kB z;%GT*1TFDS5R%K8O7^{K?<2g%B1J&Ov_ZrEsMtHL`db#!PrBM=n3<*+D+|?Rx|+FB zM_$zj-)jra!&(1f39}EE%hkqD%z7`E9_XSTm`fqxDzB%FyzgX>ppTNG+5OfZ)>9z) zb8-{Rp{ZS-LpMW@o&+sF9aD<~G3FU2GR*rXvMS1MFa94qbwFi*3~Ak0Esh@$mRG?m zCzMJUYI$Rkajyvn%=WJ)BU;+30#9b?e1{tjRjN`1WTX~AA5C8N6XZLJU_;{CupyMz zj~f=XE#IY&3ka^X`7@z!HsXEJ;fWlz!bc1_ttx|W%nEQ`|JpI~MZw1c?#b9rSC=qP_;{HX4&s_`9Y$?qt3|JOh%8WlQ+?r!mhe=9EOHY}Y z#ZIIWlMXb^yta?ISR#;sQfgvw;)$^H+T(Z3XLek}f^E#E64fSarZa89{H~6u(U>hs z*Z(crMv7JD*qNt-O72rdaWD8MoHl)ljA2_NVU0=Cd(cl&pxTy$DzVc>UKyjxCn|k7 z@nFu~Gn`Gb=iD*$wHoNU7feoYt~^qpPrnoFU-xpO#Hm4`>-ObHPPp>*7GcG}?jJ>S z$O`9!i#YNi?e1ym&s_;f|17pAqz7MmHEt|lBdaZe$DT%&e+Urg0g1+RftPz89|K;f zeXbhb6i9*b_VH-Am0D7$P8uF5(lHmjEDtLAcACiG)PSS8wdeA_ z<^I*;Jq=J^@5sL*Uu4F@zt6P6Wj|J^qgviPUIIBa(asH;k=Hbn`_S%UR898!R!iK@ z2Ti8bOVI9f=W%;G4FiIQmZ6p#dLMhQyYXssgi@x`Jc8jOD_$BcMnPIb(GX2Oe=Ydo zjk>lpvl(XUtaud=Pei~^@6LUpMyU}k(*oa{tHCG4UMn#*V%;02X}oo|{8`hzM&kA> zE81J95NyyjsP`40^cwfKLKV`Jq2B#KJ1P>@r3v~9(A;&hT@S`I)px&5ode)Gv5RC| z1*BgOog`}F(PzLE2%gp`IOZOLVQ#28(gsZ5K2K2}CBgSL7ycyQF;1>2`E?+er4}8; zO|cQaqK*r)t4)Ky7q~*NJ+Cy6#S@ zASa-y<}l`D%MoA>5Bb>E1EW*BY|xZ^g`3?5=(GmGQfon1*_Upm&(PS@yig&ac{9VR zD(2#VfA$i$3}4|hIcyUAVs;XRniapW{$b*Jw3!d8x@}8VTY+p#*Y2-*3|ooV){$Kf zOPVOJp!lmFF?nsL*XW^2Aw5Z&#vAIQREe6n;Y~4lH#eJtn)GqRH3PBqUkO*DjDVRH zub?K8Icc{ApJ!6GJy=YF0H*q%oo`nw`GfqL?l~Mirw_<&l~=en2x%UiiB!9J(X9P`I*S0Hn7{NzC5^YYhxa`}#VTELojRWaZtzk=s{! 
z%f!Bx&*ZiT+S~?+sM#2G4NCok-A?{Es?5j+9am=K)<6DxB=F5MA^ro9jS!KI4q5(v zGQ#tF^nPlECrl*#1xj9ty4sfe^gPFJXSDtrM%7YxGL{_z0#Z>%LR6KZRkS$$Z{6@9 zm*rfRC}52fm9i1Z1h^J|4LtqR)+Yc4;&8YOzT&?8HfK;tWkW3N>@S+gA&y&*`0OG7 zqbYBbO=j}|qw{U=jN=AZm5;;4sO1y0ngGUsk?qSWq-#Ud#2S@U`lR!U zv_&yR$$6@Tl5<^AKJ>k398Moj3LUGv1(v(JoPeMxu{DyeJMU281AI~6$Fe3KU?1Y+ z{u&MQ{dwGv-S*2Kx%L&#-ER{1ISnY9PS4`^J>W+^zg@e26Z!7Lr+@tOL*H-L`3?Wz z`apu{fu~>Uy-2vOA^LaU-RlA&-iG=SWdGUr_itP+QjfhE5p@6bkEhdU&@a!})&u@I zvY+kA9$y!@7jyT2j9|G6<5=Mxf<`219)j-qUp4=4y`VpO2zu?SpU6sJeH}qJSx(oI zePulnw13wS`C785AglGXj}wjPe=PF%sZe0w^+y)EQVPHWZr*qA`m;V~k--b%bVJb1 zQ0*VG+uw2#c{`G;)Uqu9!=CSGhr;s`@U7V;#yS{DFe2qGg3;ibe-qPNEXT>pQ3 z_^S1kiUjoNAK(6*7M{RUs)4)R8pm^qsF+tM+~rioZh4Xj+(V=i?;qXtBth@*3iH;L zzPf>H*WWPy8-eDZTgco(xmxU>(g3YWMn)j+hkxqzvRKn&%0C;BzgzK{EI^s3 zTz`-_?p%0fC$(97)Q_E=?)*ivs8m1{BYz*4k6mxo=;HOJQAXzqc+o)#HIB*B?PQj}Y(QrU?!b}+05 z-!rhs9@m^&g2%DXf`kwAKgro%Xy9^#|ESv+nQ?j3?VlOvFLjWbPU~6<>s!mesB~t@ zM)%J?Q+~aLWZmEE!N6YsIXv=*Iz6O*2->au@h6jzo8cJy{g({qbvB(z1v0xv!L}2I zj=ck+)dlAkk=_$w#Kf6g7UN%f@oF8G1D6^&?8GF}<+3xd;k&zs4+sv4YUB8-UNwhh zD;SA*^}{Rg(*U8m zj`=YtZGXUmh~NiDi@-f)6}~IQmlwBEUe4}a>soE=@E9dN6qY}69ypU``f{;A2%hP7 z*~7<+onPV4?5_H>T8c4>w25{2c0fdG9m5t~Iyo=1FJZYk+;?2t+=WFp59Hs~LTIoR zYX^sR=q2c_apwuL3<~P!j|CU=>{nJ2XR5oD)T#YY5`EdHRPzhef2svAwii^PCz>by z$?Ge}xt*|3*SX%{D)aSZ(y(GT}R(#H20cwh2Qa(DjpwuXn!cHcc8!C zPlzAM$N<{NPB&)TJmYXF-}6aYbvM_#zr|5OU5_R_IGDo*qb7Vi;WOMj{a7jCvJE&> zLQ-C6H!(P=egW7FP$L=ZL+skKx~cltruWDq#Cc0?PQBT4K#XcI>T))PixDqPE-N~u zy4*UC=GE!^!E^RJ^sKW=%ii-_cPjT}?5#>4&t{cm4-F7@FMaXQ zL~^H)PLGTfGF<9;_A1JJOg_12?zGuV{v>kMvydcWgBml4d8H>rt;BYu0gAk@?{TiD7}!hMF>l(+>d>e&&x5vUB2=S20|Lpikh zG?CqW(LKgwWn6=-=P%Rq2Ow7RvKyP-Wl*?|Uc#N%888PdhI-vBG%~kw| zt!E+y%oVZl4`U_|%3|J7zhC7a*h>Dpmo-13__ijtFh&2nnXpc~cxvMJ!2%&~)<>HY zOTWt6)x{U$3q^^i%e6o5o(uB8_jme8*(0u4_u9nd06f~PUz?@`TT|OfX34$SlPZW% zHdqVVw<}1RDxz~J9|~LN%l+8iA-f(|SP9zARN&$NuonsVm$(r^bhHP?9SsMGE4r@z znz2e7Rd>mF`%jyqj&l7o&OHs-0R`W$3w^wff<#*Yr;h&Zoza9VMMvTF7AWDMlRy?R zoG;zt|SgTMyq 
z(@hoc@z4mDs#A90LdhRzC3N#uxRTAl+>5PdqiSPJN>$S5Gn*Oj2c9US3T2$*`2}{$ zWD+Z_(O+(rX04;R-U1nim|T_gP&dQdWI?hUnxp&-5J8orxMiCPT$ep`vg6!-W6F+l z9@VPa(}56e_}QMUp4#@C@pTc}gPrRVgZoYcyUj~FrD)UQ%pX;5L>%oSjc>%W+8x@J zDHx3#C-p3{2`HW^?sbznLagI|Sh=q0cGbUW>dP<&!KbyBfMJ!>h=tMvD}Dxz)&{7@$v76ODb8aPF|1 za)lHutWGae@#n-|`&>X!ubrzLg+41YHebof_o^V|B-#lY)W3iVV{$$X=dFP4dauS* zR?6jQO+zx)mo0|*mbnqY5Dmzb;t2c%OuO_M9R1z?xHeti-z zH@sFm>V@VD8$99%_tujJ&Pc(P56mh}%;^~49?q$#hR%s>#LKyHzTR3**aRPEqHV{g zviCwM(ioj4#cFms9etC2Mk-ygf5?8$l=;*4egfV4aQpnubnV3wcj0W2%oa8CPz`T1 zr0)QlI06+Nw-{}mGoOum^}C;Jnu*%e!YoB3Ve2G3pI8W1&pVZ>^pD5`tFp7{9xvpI zY-G4NkMOFsUHdbs&`>N>zGlivtzXdAm&vUK%}bOg)kWeD*G|`v5}-KmC5(JF7_Wx7 zrgJ>3hILZKy>=zN|CMg`F*n;tjQFi5cwhZZ^=A&uj8%H3JPd7^hf9o;E}&KIuJ zVrT`6V`Dj44JYn<1+G_1*We7j#PgWOok5Rvn%87VIm;+Z5+mcXTv6cgdHCX_Xzz1u zflWLYI|b1Vx(9SYJ|ut>#SykDAP)LH!&;sdb^ku*>2qrP;iY1dMS&aSkO;TW0V&-&3eEh7&v+*LVR(Src{Du$KvcY&qc2lsD z>Dj`HCbrfe%|t}T>)GBUFvtvM9VwmD=0d(RWWC;o%o(@iT9hY=9se}u(PFx~w9OLu zL-`=FH>3ZQb?Apd=Yjy5(2B!u40eX;kB49U_8X*m3K>`}ty}S_#ub;dDx2oXts45# zwiz@_Uyp{cMD1+{7JikS;)RC)*o4|gKxF9$;qN6Xp~d1wG4pG|l5cnGpNhPl<*on@Mp-M^j{EM1ebS}f^^*5*i5;Tnh~7?jZy zoxAU7Md^n$p3I`|5!dQ7T0EZ5pQpN;v*{z9R?yly@56_o%sgR1!sbE7)~4P}W)$+2 zE+hQ@S9)F+rP)xNx0)}P8uC?au_wzjSi^h|eZ(i^QH6A1)~q90a8PG&)t!7~jWCa% z;bV{(o1IQ&n!iLkd2era3u@ARXc5>w`suk zjO3UdpYhccLLd_dbLnI2`860fMJUX+>~E0YT>em*I-fVyuMvHY8D5jSrkmFDGk1u# z29m1Y)IYNkH)_<+!Z}SZ8WueU-s}{QQ{hvol2hYl(mwbStr7fcs|1J5WNle2=6ryncAlEf^->ThvQ*I zFZ$K}_*W~f!cJJUBF#aREl-PIJb@5^-6$(wul@Flb)=rBGFFu%n_RI`(a~$QBSdbk zU`AwRF+62=Z~*6R{>K!(@q86~c}xt+6isXzrMFK+bmR?EnA2tqgJ{%YT=KEhn3H1Z zduIxkDaq9?nz+)?*XQ0At%sq3a(k2_ZzHtm6jh17KEWimPEzB4t92*jWlfM*D?5Yp z$+EX0cuF5&6dl^QlTXH(ccRVvxDNyDzdg z+H2jM&xQluv@cP1u&UoRN?O%X^o=6jl*YzEX?-C zk5up`a+lj5vScT>HDR-q)_u3lo_&wgx{%yZT5xh)Q+$PYF+KGB>d8cCa_-2t_~|q` z1J>j8uD=G{P_RU=FL>`9Nu$b^VoE-ZO)$jY<0CfT(-t;0#%VgT8L@>M580;&8gTN^n5qwlNv^AVf*T9PRieJVNlvXW7MkKX%LZzt02ZV5tpWsJ8+8SD*hJdp7;|} zF4b3=o?}{fq;|9KSJH~~q%{|o5>SkV_2K>HL;gIPJCFDQ&}8droe}Md 
zbad9F=FWsSvn)_L+pt_5T!hUfK5LgvJ4|4b^{>Yy zmOQYEZeJ?SfNSBh?yky$mG^zB*)<<@^#{Aj zdH6~^gk@rN#+2n8sOw8V0k%&BXvjkDz4KVm)aKwHs0yc>VOqt(aYD|y!lZM>nL*u$ z`f5UmIGvT~owpRYagJGmBUtq?!s^!%UuD=qemyc|cd^jzl<;v1Ys)ArKr4rY zam^Fm(fBjtw6D=l8%FJ{DQT}3Ac2auK;Y1D61*48eI(*E(OUMk{Y}j(cKFzJO8;=V zY`w=H%u4vCYDn8m_2eJ`)39$n~*L8GI9bB z^{OR9uJXb5Z*kpv;_VTu7{gR|PBbAM!K&2$f|X41s4z!gUZJ8w=)=+t_9Sqx2wKx9 z|144739%i%(S_8*qs0$)^)+nqxYe$X=O5DX%iz1B`v+y>L`Kut(-eome`V|rv5Nu$ z%mKVLd-r@Uc-D|ey3B+}vyJ{y$*%U9?bf~#pAqO~$puNbz=?y2ya3KOy*PCspxLdyu@=BrB*kGQ=n zb6jh-qjvVWfv_N7t&~0C1R@fCmdkC)a*e~%TG{lykE2AcW9e5wn8i7#s~4N~YE?_m zb^KL^Fgk7XWE5%+@*cjhGx#e~e-A|J22y`T>NFVV(r4|t>-8xyH|BQC4&JjS`J$uq z>rsWeSinTXAUfw5}9GAqhI%Z~bu zHtt>&g^VmtMSU*#q)eRP{H|8@6~+olWAJRv-8k=OE*Vps zK9;2rivG^%=T5RX7pB-5EQr^wO!B&@+R3zI>9hFIENbqA;U%~H4JjLL;JnS5@WHGKy4ZhfBu_H7l_@ch2mc2w!1m+@jh zaR+MvokM{?kfcrQ-Fw1(u}A6IA^hPCi6J6-+4xpr}|P12KOe8Zx(VT>`Z z)$D9>&u+h|o4osR|Bhv$yW$7_X<4#3soYEuC~r+f=2kriaZ*!AYc^X3;ER$!d93mF zB1PTak~_H=YVpu~_4@g2%w{1c`Yt8iv{jK!(+XxSDx`a}yoO>zhR?NjYT*)xv|Q?L z(M@jaogKsKzJrqgF_)}|TETWzz~n_LG+SQXjRuuoXYRR9nYeSVGs#ZJG-;Kq?8U(L z?l}bx4w{Vf*KF;H=i?W&eyY#CD4|e`Ar-9SuWNO}AA%}nKFjTWmA-iMalNnQs|(q1 ziH@?hS*-Fnw`8+&pFIBTcwvUG5}yB}oLg|`%v}Ke;IjGxzOiC2h`Yt%xcOXlH#V|< zf6nRUT}3>9p6C%L%|m6wE$kv_0&0$bb5!baC*`4K(5&VeejG?Xa>YmG>ccl@Q8i&IOE(Q{@kwzM$U3HP*upSRh%9*K;bWy0R z^scUhZj4w?W|SQksSdpbI;OBp8&>QLE(MU<=O#ZDrhVT|1EEojR&NO+vaL853jix0 zd2)J|5u3x(^|KaIls$yB(h-n;AHg=$<15#W&i zu}`0B3tZ+>D=c(<$Nd>o1ld@)Rc4>ru<)v4E(+c>T(_3mE~7AaTv(9m{GJ-j-Px)1 zBNcQh+#JU`9}#{2Q|FMohygBZZx9ZBU5Nsx&y)K(@O_1+RLnB?`L+SP(50h?P%70L zx|8+Gh#?*79b1k&dWP0R4Xd+}sj|)4m=*u|-c1UC+B%_<`8n42wm=?UI*9W|J~VlR zJu1>0Z*De8!!C;!$ahcZ3+MK^BfVC_`3%*LGxH4xW4};iL5l}}217TIhk^j2%j_aM zmE>n1Zyr7bWqSzcHq(S}uFe2D5i13O)~uTy4$M^S%;ih*Ll`D*)vc{9@k8FgWAdv@ z_c7GSe3Z;NrKl7Tq3-VP78Z7NYg_-!6aTqcw2y9?9UdOe)~W-qRZ|+zsX`qzraN5_8Nf|sy?l+z-Dx{Wy>;ftwJJa? 
z65Kt(!GV`klWw9#`RADaoRuM~n>iO(zGkgsPQmuf7wwk@A*!gih!$QE!zDu+y-7%$ zngT~^hkgbYzb*#gIzKISMmbegpPS7P5Zu52fQX0~Kc~JO@NAt12&Eij!!@1IKDDt) zxVnw&-!(F!s_#D}eLIWnAJ*qjThRCM`r6UamSl!96?0Cf6N(Vt(XEX6tDlwSzI?VT zg0^-RVxG}vB8BDft{YXzD6HHGan*#oc6%v-(f}^}p!}aZ<%eBmp|~8sWC2Slq|9z%fKo1dNF$9A zp)xZ8>E6BMwi1Dm%7jp6YeJZ%RI?TMY7d-6cxj}cQ!&$DqrBR~esTujmo6XKwBn@` z^>t;w;`bj-+Zqj&@*(Rx<7M_qIjn9ApVu4sNDGqQ%2a0ALUUw}OjWDtDmt+OqVk|i zlOh)r_bBWsn_MK^*O%+ePsw8EaA}Lg>SF|(Df$)=_I3M^bf1IiKHuJ@B8ei;bt&NN5+Zs6 zTH`5vGtM7A^?qlRTwNMu9pWb#GSUOsH#xyeQbi5dG_6{2Y zNa_9eJcatBYXPSuZ&-;Po0Kc)?SvYBldUcW!Iov2yoIytidE?LwG)!@$$S5%nfPJ`M{HXJ@X z3KgeU+3n0b=i_*jPd_0=R`sSC{|ynEExu|cqgp@hAQmAQrX>LsGRTs>AY~M~S|J}Y zodaNn+8rUjah#3)!lhJ!lNX!y!8rbU6L)Gsn>+WGNoDA?YG2J#<~2{HJDy<%a5b(( z@=)~{`Xz9x27VYGvP97E&O4yt*gHb*5AJ7c$K)DyLw%D)Gbmk*R(f&5$sRPfxT&Qg;i{1Nc|(@A3hRan_;vSjhpMW+_c&AE+ANzzn%{p-(ccF)=u zHPU*PfnA_puVXo$8v+@h)#5202u#V+g(VwGgnl$BMCy;pId;M3PNWkyDP>ssXX}4T z!|eeO+YVV#4eZUt>bQnXxuq_&3s82}3EdH0NEQ zap*H^GxfusAZub1EldI!aXqccCoSq6!qJ$nd-=-|V)$Yw>1&6c1L9akDPZCS5 z;H&+rwi!@b_=M2^z{a^~Gv=W=75oAtXO@;h0Q(hozZPLak_pPuYetCCg(wr8w5k(} zvBv2G$2r-w^Ww11Nm1JUT0KYR{+aD&GqRMNqpoIc&W22=4IbJDtZJEbv}4;K(+xsN=DFJcxU~Y6wJ{jUZ40Xp{2o5P_QNavvS?5L z_yh%bA!H#P7S3gePvjCJl{UKcP5;J?s?n@+{N4Ee)h#Q496bLo<*S#K!1sVMToLp5 zczLO-b?iM*BJ!`=D2PPmLJ*q-ev3{1MOcAPxuPI{wN=Xc=Gy;78#(FjL03EIrwc%B z;eSCjE~{Yg0fo&r2QBTK!$0dNe)T*bzNPXALyEuJFv@~!PIvSJAN{71LX1NeE)7iFBNsD_UUm1?JRWD99&+eWnxI|HKsGbxeQ^n!Y`T8rc*=;=P#W$C}A zbdmcj>Wmr{2Zy7xHZ~g7j)zL%+1iE%ph2M72QhdryLV3A|v#m0)ImEc8!_{xBSlxyL%K@zXnb}OucpQ8mc3fNc zmsR=IV7Wa2t!~ziPfptHF9S(o{cI&m-z<36Pc)A%>F{A(fO_L8Nt2);u*TKw92^`> zOntN4|4n!eZf=0sfyeQQ@IEYXl6aA&P#l4%LF8RREy5zL!XhFf!ouD@E(o)?Vj3nc zX1V(@#6YIksaV{2Kc!3{M1UcbkeIOx?*TFXKH_CBd-_tZt_~|?LXD61w$VIBqy7#e z5#5cpbUTT5p61W5j3FVPXPK#DKy(rU+4DMi)7~L$bOKdqVsgTDHvkOX@!^*|6-3l{ zSJ5X7$Vn487Ytb&NdLWE)QAf7+T0k+r_1Wt*5PXV_)wN+r-}GtXJ8Z@92hiV$jfcO z^6Ij$@G_73tL-7MJ&PH41|%5K|AH7;go+jxGW@z+3v7`Pb@c3gcJ}M{^$mD!GJYUG`#79P)j)PRSb`gLXOQ`R!B-Q{oe)#^@zn)Axirn%0Bx7Y#2bb(E%(2 
z7+qkHerxNB2_%rSIz%>2po#lAg0;hwdI;>{!0o!r9Svv^7GO?-))#bjQ%Onx2U5UJ zNI06WRXaRtXKwNFV{u!&)R*XDeBO%~5-)TiK;?`x=OPdtiD8Ys5@^6K@gxZx$r(n3 zPbl0K%^aML>FS=Dal3F$ZZn}zdX*vSmA``M{7@3iFdAUKp#v1IxiA6}?IJV8hM2Si+d7Tr{>Mbh)oJWAFD91UF=$6+`Oe2uzQy`P{ z1>{g}fe5)DO1caiz7YLm-Ef4NqAYJ~c+lxRX3D9bii@2hCuZ_gb$D$|gIVx%-j zDgNmR50`hxlkHF3a16+;VesWu}dHij;K<&Xj zojd)G2Zz|tjveM;ksZ}h^|rai(Wdd^u&~B)G%aXT%olCY>jORJP0Cqh#`|bO6KZIF zqH|o}4Yg45betMzGS`px0aL?V{7&3ezOrP74-v@WOCcZzBOZ_{Buo5A*7vCzMn2uMmdoI+feg`ITVNVAD zK&Qew)gfd1DLw&SB=@{UuGVH*kME$v3?Cr{F{I6O@!mOBrv#r%wY_r#{#?zwg@UCc z*vWJ|uoykTT8xQU0P4U=MULUW&!c0I=)zv-;Hcp5)tfuLH!?#?*%M!VUy`*QcT6&* ztqlS7mGuB9;e;X!r9?kk<>KZ?qX==v@g&I}T23`>&f}9dlM;F$W)c;mGV~(U%M(J6 zA=I&5x07j30H?X<=_#|_uEIxaNQPkH-bY?ivjqO0-0pfkQh@2DiEvy}ae8Sbb~YAi z+F+!Hi-A;Km?;MUQ_z`SiVeU1n8H9o)LxaHvt2{^Fi9Ep{qfXuySg&|u{BL9=0~~2 zKMQt#iIP6)Vt&IXiuP)kwYl0rN1%4Q_YatMIRHn04N}HU~^8n%aB~niwg4do5`a0-=qbb zqL>pUm}iLk&i^dc`_;(|z55Qk4Y@}Jm)r~!a+exyicX^p;95kh(L!Ul zj9V>UW(dB|kn?2sR_eP{lhfQ)R6>K}JaD);y&CMOohs9h{s_B;G6t}@X7v{y^DUIT zyKidUlg<$2k~j6mzGI7ksEz{dXYR^8SK}We$k)VPCq_%*9b<_pu9jd9QJ^W+UTDzP z5@p6+gC1;R&v{WG3(4iAzNwv8UZq=RPav+fUS^oQ(TsZw>E=+k=xuZ+=BUHS_2Ls_u*|L}X2A>2$T1cqohUElK4)(0%hG zUHC=S05N?(^%`c2`Bwq&uaCGne&Z{@CKKn}xEeXQ8x0`1K+$MP7ze`RxgOs(?ut9Wc1>=ZGmb$q=57Jp_s&?)eW1@+%>V zSeddfhsyk#(h}i;XQXcd4bfR4*RM+7AFiy&)@tnDiXqpF64UBL_8u`@B-no%-&42pGQe< zi}xm3AIXa|Pk8QtAk+co2IcSNPck##?$L8}azZ*g7xDIKZEXRv1&ku=5Kz~+rYnf$4FyO`T7l6qBET{g5B-|Y6c0Oat@W6|cTt99iCavmUk0bY>K99%oHMV$S2p`{H1k6mP>k^=9P zj~2{+UB_SDCMLqe1Tv*PRWDN^VG&{Pt=a8*r(*y&0Wzs-+f6k9o&l1Mgm!?%-9?-~ zzkmAjW?g3W`9!JlB_5E(VTfz%&&q=#8vFUTqcpo}kw0z>li{)2VCDfwpzY{_@ii3nHx zaS4ONfJGNV8^86h?mu`#yp{d%=NkO_;kz@snvwyY{PPk9p4S}92uRGY-@G}J*-O**;ldAA1AZUPM=wD=`ET9!5jug&lg#UeRu2mdtn2A06X11pM|?R&GnXXStxhTr>o(%r&iHL z3SBe+M)FvG@zVv28iH`gsp6b05W_jxxVZS(lte|uQ-G>~TpyB^o`kq13WSK;)Av+6 zOXI~r4a!Sa%-%~R0N-D)wY9wFqAC$Vk_;TB3j9=GfA$r?qX8?l2NcdQuP3*~J~3X- zh?(|6w7B*crz_i2b-Wx*vTc_D-JBjUKsxX5#XXOceU%aJ92Pgl93>aXLQBGA`8N8U 
zpxYR@N4S8^5(oh*;sAd#TC>v_Us{5X08VxL9rt8S2I5$nnOR<54&Ry%1!`w5Uj>|m z7I1^h$-2zp)ggoSom&^DN$|>?;{9s#`daJrBfx9uyO2w#9(JEIy2I1YR8YMr#z8+r6&`F7+6GnD+4jtjSaitiork zXRksv{{)eMmnFYc!Vl{QHa7tv9w;e86C$Fx2ao~McG7{edOc_52$veTi(~KIGv64J zm&AJrk`m-Dv~R(mmW_fJqC0Pbzuo@BV??}%s4$m_@mtjG(EkEE6oJO^e7KC5_p)5t z5Y@DPs`-{fEDMP!iXxEUPS&N-ZA(3R)QhM(U>~?Y2S_t;2^Ak9kmRRS)Ju>pc)}+G z22IFxWR8K3W+ya+jY9gAc<;pv5iodN%ml0Bqjb*YZ zId*FX6%)H<^VHNUe?{BBYR&&2R*#S;xQni^DcV$FrrsJ^rr$V;qOtxgBv6Ze@xswIx=$ue=1+^F2vdR&9_2D7tO zR?l)>MhQ&lwgze5tcBHi0Cx}((Qb-N^A&3&EywNT>>Ic|R1nL?OH6gv^e z;7knMSbaYP%&%F|ml9>0U}CmTN6@SHGiennoJ^A|zUxm1U0ZgcjGY{8o?hWU9O29*LNoF zV4^$mGMgG7ttQV6)X+AE$NSIL@^SZZRZa%;MCz!_G6o!WyF$PPbkr6!5wWPRBWyX3 z;jmeq7^k|iQZljK#&i1MBEQbYBs1QFs7QOka`q%H1+|NE8C)kd0Cc0c?KD82d2Ff` zLN@@SRUc4!Av6(WDy?NPXU@NN+pYiwOo*mrHgnuw8Q>;K0pC0Q5-1ShiAz-}Vd30= zR(>bRbj_D0(Tv+pXM5SeBkBAqFyW~w5FIUR0!ZqD+p=~Rd)sfQr1fho=cpcsm3-v@ zj@q1526;DD>D(#OU}0hM-g$1K*1*FzokK~hLVuF7J^=m1+haR{p1zQ|gb+e**NM$7 z(|o6;bHqwdkb{C3fI=J5k<^>+XK)~E)i|vqF}CV@(~wSMH-!Rh4xFY9qcOlPe&Td6 zYS#D;na4sxQXSzd3;<~w!7C9moID+$q-EdWS2milJj+|XE>D>^!LUt?rncC%mGpVm zKV|C@GdjV@*t4JWZhE$eW4M$&5?~INGy%bfj6VOxP1hqkA4@fFRVgb)V&ToV=MgTm%R3jZLcf=(!jY30ugE z`?1>`=RTzpRtpmWT4RJX+K?*8HMZ^S&@)Ayju^PTTdVb1n}Li?xw~g&V4FUqdj(IZ;qeIiQdlnY4IR+dP#| zY6MEF6}S?Sh1B>b&KlJ?Ufj) z-Z^%OxBbK0JvHPDOdNED2st9*LdFKocf4gcMc7-<#AQ!{WZVzZ1f+3h%J&(oQ_{bF zpciIGNg=)pTi>wen#&Jpy~g*>EDoFE_F4&IoeJaL`g+e8e$!i%~$R#|E@ML7h?!2yvhtr#`6 zOW;T^!I@_-t!A4nHxun0Xo!QoNWg@*S2}We`UoTm);LZlKcb}c!!5%3vAvOEBu)H2 zL^QAlNXpPfeK5pibl9n2UVFZYDTRun+~+^iBQ8mdvbB467T0Sq^n)i!;YFml5So@y zvS@moQof0o@LLKm&udyk!yY|J)lFsE&+;@#ZOh_$s~{`J#!<~hwIA;<+4((j$;d3; zoXYe$Hg$3ka;*|Op}R6m%(A@12~mZ#c6h4C``p@a)-HOhbUNZShVa^fU0-s(QL;#__;Er_r+Ha2wK{6-$=tdX#%c;JIXa7k zc$-FB>VT(>(PW7oH`(L%j&Dhf)YVzf&V@ljeS8uo^aEkC5V`ZWaVI3aRxFIM9~Ajpn~9 zS{teZYNPqQNU(o5YGT0YmN4Yr>)bJ{m{Ky+;Oh|8K|=*L{wjnCiG*`GuKhp(xHLpv z1Y}ljxL?PzsmR8Y)GbREp|)dXa+)35NLhA~|GUbX{pZlJ>^iv=d57?NtAMnW-$-S< zcNOb46vR@EDGWY=@F-&E-+}39HMnUXJCSZMhw-a?+_P&{^GJBzaV7~=20n0A=y)e5 
zpJ$TZBA(t@LONTXnoktB6jm_q9*C4u+9QrSd%8AB>;6@_rqh4cRj$+r@`kBS#>wKr zl^jJDz#d+TI;DRzyBj2yE;n~w3Baz2da^{{?zIBrygw+~xf7S@+CT(HgUnDOMX@gw4QpO*y{swL8EWr!eD z%x?xzCdiR+HW{n|qM*@_baz}}Shdr&Xs1+nv+y;}X7}k~$DNy#Rtl?gpbbkNw9; zUSn^qUCC~j0x)ijOtR>~`c6#t26gDR_MDoSDg86;cPg?}5o(vz*83gsv3ET+xX@0C z*ct4Um2Mw$!Zs8k5@|pQLP4uph2#UY(#0UIhxn_MY?c=en7YR=rSF25*9(0}1|R0l zFx-j&9BVQ0P3~GseKaroF#fUA*S|UKjxAfT2Z@`iIb4_8y0gS0l<{Ib>)wKxIvv~? zzPi({!-iv)DwnPm5LJ6JF}dx6M^5lkOJ$Rx-wTIMb=O?|m@u%Ln_2=jHH`NvQ13n4 zcS|PL%!a*f$X8_g#*EVAl5)8l*ws=V(Q{ll)-&TZYh*q{JUjC!LR<(-mN{l5xUB9S z{+iZ7W0`JV^Q?QYt4rP47bno8&#vr{XEv_R(5ivo&K7n1hP9aHs>EUzk?l(y75AO^ zY55%Hs^>htCD-S=C2$rG`A}_*#5PmY`JWOX!xx+ zbn3)(j;CaH(ZSQaW9Mlb4E^g_i{vPfdXc8E}H3YK8j>229K9MWg~- z1%yc^0k&`k+e-54RI>M!VPbcXR(e(MqT?u6=vXiN!n30 zylojzb=zwN`Ad3~I9A^@>u*``@{j7mL6wecJhr%3mKelxuB8^<**kMaR`y}w?Y#6O83XVq$xy5HSY6+?{|;+E}{27(bZjIApqX( zt#hfGsGBA9+sl*#0yw3*u;&?v#VpM)VZb>fIeE$i9>8{_Fp-9>ovzP_tD8PY=j~5@ za^5S{jc2^BQv;g)K=#!>47l*?up_b+`{nBe>qJaW4SQju(;gXGn?Ia!#w;CI_I5st z+o^th+@&C1R3e#i=n;kMEpv2Ls_Le8c@&~rmTWWv2<;qK2bYhu<#&4mx-)?Ur%3K$ z=(c@--QX)U)Ml>b`NL`%)^eXU0g=f{tm+Y;IN?Q^c>5if=17nJ(c;x3*kdYH%xi1G5o{gUks{oZ6E06I}X!Yi{m|IR5eDui}29 zdMj6udTw|)z&dowrg=MsrwnI(Y8-8pC96CY4~#rc_CMU1v2NxhFl_}pWYOAV{w?|Z z>FeImf1|eEAeo#hT|At?Prd8!TxcwGO1|ts z6-q-52SVfytE4?~?yC~#r#HTh?aA1t7YWM<6(f-$Lr>Y736Q#t24D}mhsr*Bj7!O> znnq#ZpXPcjX1TK86Esp!FtxYwcCGSCPx9tlsqm0$loH`pxi^3U%Bp^2*``6{Bs(*e25PVUCYl&N10C)ARqM1jMVb3DD zZMCfpfPK|nT)^?1#V(U&f%xooH7}dI!T+TaH0{%RLA~n+9GZkiPajoxQMF(WskS}* z3Y6Lcq!&O(MfS$pGB8ieEIYDFF7?pC3);_w*bcT@Htyo_?kGf3TY| z_c$PR>FG-V_vqy0q-?NaH|y%Oa2exHl*m zHzfopj0b)~=3XMEW4;xQH*t_nfqCPlcKh~;8)=6oJOF}n4_<7?C}Bll;GT6TT^e%0 z$|9oc;l(<=e=`sLEqKa z9@)R*hGO$8Vzl5LG-gv1XaY8v&st#v}_LpwD7j?i5`YAetkBt%Z( zrua3n5KZ=$;3lo3*87&Mg4C}J?fG%@@W!%@oP^%O}IAn`hMKk}NX~Icf{0`V z$qYkAa?UgG_PDouKj$1c-{0?D>;2=Jg?IPeU0q#uT~*b6`)sdN78=#(adFo5tpyqI zr)3}ev&#Fa&FtiA_tHETsG~UJNK*}#v9QnnW=qIcr2~6+4?Fw)OWiwbR$Q_X)gnxr zzu;>%rrp=i8ZfKBL&NbQc!dM9R=~^Xk(!y7Y(z&0UicgGr93x-)0_Pw_1Iq@?#lwh 
z75?2u6w@XoWNYjA{Cvk*UyW$oz^)e)@;;mnf7}xy1MOsHaz5V8hn<@70sDX~v2oBC zvV$HTF3nOlZt!wF-d!oKH+ZD>NNwi1?R?<%6km{UuY$lkEE0BjS zFDB0TRph?Zn|&(=dG6$UyxWx+pUjwW#sLm4!u<}*RLDUKDKa&=&f~ZiWf3tzhkD{m zaiVcjaScxq8f|=Ub8ctpR??k>Gwc}GjVkxG@p%HO6cJw(YPgiI(AYRBm1_10k!(b~ zk54S@TPIe)+^I7#QNjXEAs$|b`#&-)l~d)QwnzNm5>`Z*ri3KUu!N0O*sH#qZOwgs zk0WX0B&qs!qHtujWub3vKJ+j}WwV`bRvdZ!;i*2(hZCPO*6O!lG6}s^$2PJ=vS-~i z!Eqnh&!u@WFZp-Gw+6)<^t!(ro!&DCBY1K=`s)s<98~74Pnq~T;mSY_#7=(bzl43O zx^$TIAb#XDh5_uU_KkPM3yM|-erd4avlS-p1?=0-7F8X+4AhqI`^iovWAe%Is?d8}(mT_{@PubuVxzSy2vNCi73Q@HfrGuZ-uySf9w8-OM*$v+h{h;mu1A&V0<5)(61wDg=C;U$3cE5<>AZvBvg8u=r}R0 zNZ-Q&zr!p)d4|F$L#3&CCv*og8MR1tlFgcXFYACcFFgr)9G+R`5xqVn=Q^X}TPp9A zr_y8J0T;*lB|xnxap4X3>{M1X+pW}y@Izb}#lf_#NHaxqf zq9-g@As-{Rz^175@-Kh+Z%E~@Ep_caHB)aQ&`1f{h-5k`D7j?3!!xzp{jzHF@)DHk zZI0ZnY*asG8nR&RFU=N0`?f;{`V8cYof4k5?dC+$CTB3h%a)R2zW?N!vtT{F_Q)R% zeD5Cxbi2UkJ(AIeKOh^Sw3>$c8u1PoUaUp5kzq7#%sxC9Pj5T zEq>fdrj4Y9I6V$K{RWfAMDK{JeUCFLv1KFP`;+l9jw_l3I9!oU&Aqot$hfgp(loMp zw@A`ed*sb?ZGjS5BWm)F92(dg7 zO3iq5`EbD1t?=h7iV15xGYa8dg@>o~zTZl&DM;poA^l=derKakH)$8i4 zOHs59hpm1&EKwvR;cBc3k+fM^sX@!~t|&87V6tD!cb%p~CW`=QM-Sqd9KmJwt|J{!MS^lp)-ybK&YH*&81Nh(n zYZm@@Gyi9J1{csKR??SG3vvNm_y30XKji$|N&c3exSip|IzJVq{WfL|0osWwil&LnUW&vwfzfVItQbMdVaBn+Nxj4 zv&f|;zmwsfc0X^Cb{afmFSVIMcEEG!u&=LweieSQodtJa6BQPLurfJ& zR*y!vfwA9^OMGD(sEgKPa_K zAs&8WWR@lyk$RsU;9O+A=hoE0)>=<@J!&eX!Dn~4)$gz`Ib8Ixl0Xc#PT;qGR)~mP zX96jTTJL^L1wmzrp=V2bFg<{djD-C#*bv}Y3WiI8q_fkdfuf%t_+|ra3$V3uy~oBN zl`jTb0)&adcd25>`{Cf!_`$EcXWb3g=zsCpV;+>IAMb-Y=PUwBpQA25KlEf!9J^6c z1H?O;o%mp3(;20XZoVNr8`}X0~1d@2%6TH8lmj+7tXfTTr4y5UDJm=th&|#dif95jFPQ*3rw>bEsmk3VQgGhT) z8^ENymIg`Z=Ucs`VLlI<-Jw^zLKLJHqr^^h1!rU!tD4X7i29%_)GSagudPGZgU`C~ z!=X>PF^Hb z+39V55$(6I`%Vf;RmizsMS4`rMr`U2cn2m*ow0!?K4Ja69vOavHQ_x8J?>BgBLj?{ z{Zw69+bjmJl&NWPM`Pq5fcr zK&;khLqg4LKg-l^%()%7ouMG>Ydb(SI}<6xBfe{8(aKJ?KL@OE6DYA}>U1mI0; zX3v?=I=8*7G68lKP)Cp5{f#Zlp7B~=h?B2xwDR{z;9rL)KWUv6p7}*o<-NBG2vch~ 
zrHJ>ThMHLr!DrU(=~7{)UYAJS(~>0sr%n@A9knqMC@WLRLJh7#G)zsV0AnV+rvr+hrD<*Cq+&KcTT0b;3kWU zbW#;vKp;kHGkTUCMX+zk+LPA4p38JnkvTF@Ix?eEH>1fDfT1o#k&oIlFgfRQ$%4i{ zZl_}8bN-?LMA?;6T&RpAUbtg#6G!sqUElgz_*z}6pO4nYf%a_9Tyr7RQgo{6L>EFm zAcpFxn%Um!6Kgmz&YjK&CTIPFEbA9b>--gRmQxpu<1%=IzigNtUlj51>Nd`;2vL!M^cYip#jt^;ieIm5Tfttots#^W9yhK1SWA8Q)&QknE8&GKVR|OeJ@%ibpjD zD^B0#9J2p0&^e!b*U2{S7B%@w1!gEf>8e9>zR6D5kIms*^DZTYBOc_4?9d_f)> z9Ul25!am!a5SCm5v?UiL9c^$tD>0aK`fJrJ7eE3+m%on~>! z|GVPDb3VFT$+Rn)jEstx=oJIZ!e1n57?;(rPM7NK*!a=Q+LRlgx~SqN7?}RIij^SR zi0W}*UOKQtjoihMfewP|oZdT(lUs#TcPXsY=xW2@0bgV{s&!`LD)%5hTE|jLh@Ty9 z{gphYJ81>+f45o%CAy0gcCt~ox5TbT&;m0)CgP%fBDL}J+roVRUID3R*#1SjlPxNV zl#H<|b|z=FIqj+4EZt)LLBtuy3pvjJyUg*8zqHS|d`>4C0QZXFF#l0`DC?qdug>ZI z=lu)tSR-9)MWQTJ!DKu4jPLHU%KbZ~o34~rtpCf*|F#|s9EX~JzAyLRLTESX0hh9? zFrCrd%d+IL*B36HPWqSAB(9<&w{B} zuOA@7i~P%VdpZ@{VND#4o*eb~Y8}IURqrxA0mMfPg*-U|7wcpdw9FRdr*RK6$2 zM}D;~^Zx~62s4#5*M5_Tq7^eVGD=EqfNagRY|Ue!ItUp19W0k(*pCnuWZMCN-bt$C zi8H}^QDXySe-cAj_Lcn(|4YajwUWpOP^yr^)vN67t(N!@6?2qY94YC%1G3&{W2zAF z8-UuAL_Hh=BZ0rzDz-S1Vw7rE{_eH2Ion6&hXU9W10**)*v0(z*8v3qz^v~w0qWEx0yyk4 z;l25fm`6L<8&H6c{he0;`3P1qp$@P%E5Ggx#wB8?)#m0@dUuGh_u)21YJgGU%*Y#&^-3TNy#(;xm5@5SBg53A(qq;5tK3#?MRoN@0BHOHm%AZHMfhGj}tc;+<#1+2hY zbJKH(WwQ|67CPO~ode(0>vYILlI9aAZYem`=5U=cw5k%|mcA1=pI`&?p@ckt4+g<085+)5E z=>WgypMitZ{rHKy3UKy$M1=8!6FT55xW6f`0t>%9)K-Zm`mU9dTHo!AR>3F=%#~M{ zZxSXETwQepyM&CEnx8eVf?5Sqv zl3=N4`h4PzB^P*^f}yx$iWfnS7`U*Em@vt5IRCcPi$>VRM{~07sqHtYIkf$)E?UY) zbTI`u!@6ayLCb;&2{?Gc_v?g7`qaL_%{G!@W6yJLzr@_{KutmzuZ~{`9301W&~jct zaVPEq;EaiV!eI1w1&TE>R~H_E(eJV4VNTH*JQR1V$4W2+Jr-%7kMJz4y#h|OHTGj? 
zk|fbf@n#XWJ8>{(TZR6}O%B=xe7A81ysXVN{>lyNBJA^@J#p*uLcuiT#m!Q*{Ic23 zW8o3K*K$nh)9@2${4H^=e9oG8jcM^fQiMsN7`1(?BJN1}B4d6Dus*tzIt!Z(PFn0gYuZ zw~{gsJ0>T=`dFm^8ObK?GJ`KJfv7W1_aL7J=1LB?`_%kvNgDoO`c;`PzWFGM!@-#l z6Dq8}E8UT_iRw7B_u`{|mVNyt+O1{|Hor*#*2^s%A*mQJMQZ*#7cdHz_~M&z1*Iw# z5CDiB3FJ=*XJsEd@Bh&y4IL1FZ}Xk2+;NlK#b5aLW4PbRQdE?2gI|U7 zE4vMMqlpTVB9Zf0Tf}5!Nj=90e2kM~l^OOG;59`K{{9i3$bKDtUpE`C`L&P*sE|Hl z&-duqK}pGXt0#Pr(mX}mNRdI@Bppi~r(1C-n%IhUfS^i1R?+5d!D8H00d2A(V3>=S zI=%wkQZ}Ld?5X*Em9=O5p4-5+D#0K=Xxq$mIUYIkUDcHkE$~Jg|A(K4Lw^+vU&N?X z9uXOFd^mYriGE)lgOr4pZ^zq?2&YS9i@7ua`x3vGaI~N5!7MXNCZJp6hOPvhK%>)1 zhAF;6?ReFPFi?G(=)IJqsL2G+MS;)onT;fSYYCI8#&;9864q5&Y#3oe{AnT%4j)$= z;SNW)7)8-)GmY8N0rl9W=6F3Zrl2bW<-WPZ?(vrb{$T&zd7xVX0<)1X`RiiOU$|r> z8zC~*5;plr(VQusnfgubHvOw0=afsl%8NPw>P?6l zekSW2*cwKgAbhjEnv?kAvnjVhONa7fC9zM7n;BC}LQ{UjmIhkt8f{^ICqI*l=^bX|}%*uyZ`Ch9}- zPv!!}5hKOX7 z8fjG)y0^1OK;6S3e7SLAEukH2J-G#V$$@3#x9Q5!h-61A8J76GP~)rwBhe|Id(rVm ziBsm!`hMugY_NcpsvsewOsZt_qyoH80X$u9RX97s{iY^ zKesO0#(O$V@+ta7F^1NSUb%Xun`VG#Elkv>p`AX9lOynpSLq^!2FYH#$(?cDor}A6 zysH6I?}CC`LoY+)B2}D|QQdoXhKh?J-9Zf@A7`CJD>%O%bi2|J4=s-}i-FTjU)swC zyjlS$@PA|?-T{+&a5RweVhQ&Jr_pR~q_wTLVlmjK*cPlDbPwDa@Z0tKoSV{a z*ft|fTutK<+Cgmr&n2d3=wzda`95*JsI=EAl?)i2;2-=!HQKjmIN&#?PU^ec(~!O9 z-7Lj8-Z~J!pE=apaN-Kd%e&1*c|AcImLbYeTT1;4Uw`G`S@vu$ zdyOUJ=N`*d)>Fp43H8{pcQ7#A&&~C9yX*WBCX*Bxf*Z3~rTe-WBB^uz_~`ycD3 z(<1mF$rZUfg0@<%ySwJJY;+JqlE{lE-52l8CUgb&5(2|b%R$XwdF+;@0R18~J~du) zBN}>7!@kHuljYK33|Jv-X8+INE~1uxQ_tCCYd6dHXk3e-BAbA>F=mHMp}a`N<8X<|gMR8oY*(M3J>iK)EUM^x0S)4gsG}R8zNc$6t1(fX#Y{A8Bz&i{pm2&VE_HGSzOBZ#fzRU$JBNGyNiT1ExHv z$X2hzp9r9yDAsuI?0W8WSywn6U4q{j>Vt3OR4mll?A+2#GipJ&dZ{B0j$`B43@L9G?2nFihsrpKbg%L* zhPPaOEL*W=P~D7>M#~xuQ%Dy()Icgzx<LyRzE?E~0H@H_V^K&h@c^ z+LxBFbN-rGiLbVbU^2dacbYwJC+vE@(d`%Pz&I}{|ML+7)m*meHaa?%a?m?XPXp7c z2=Y$C7^`2eetX_-i*(7n11p4khIo(!yV@ZATeFgKj}~|9mTF)!@K3w-E0XRBR+ucU zfhySI^-tT5f;BLB9sea>n1fYSis=o2XL{i;}Y}im~ZLsYI z!cb^+=h6_11GzivV$^G-(ueNjPi%&X^-2=kpRaK%8K~m&On*%LP zwh8HP8k2=OcUPWyl_uDY)!%Vw&+{rhx-9raRx@gM0oPi< 
zHn|+Ge!7Qh-t#-LBZ9ZjDw5-ImL8X*d07KKd)dX@2~%~-4eiLHl>9%E8EkZ-av&i|IGUbuve@kW9VTc0?|Ig_f7_9MIpbWwLANU#?Y0!x5#CAer|MEZ?=4g77uAO5yA*8@;{a;lln$(1mNIBG@?O z5+*v1qK_0Kf7NU{VsvSANj4%T*^nSO&6L|HFeE%(fgQE5sdb+wH88h+w<2X`(OV4w z-LEWOOZQp~B}x|8#ID5c&LJ#HEAsTcel3U5ynxwEBOhp|-Z}E@(GR1Dg)E?EzZfth68k^)L2-l@38*o#_Q>a zIqoqFEg$QD(S@>|gt6CA#yIGjYQbHq${O*tS(Kqwy+qVSYGC>85t$Vk_MYG-gEH=D zl1Zm7=E_W3*#3Tlw@c(6&5MGV&ztagRcFze*c= zsX({pB-qyn(ycCb_pPs|8e1Xbu2RszMYv*sd&j!?^?K6lhoBYM41jPMMPMjY1YU6G z-8Jg|yW%GC80v$OdTSye6UtNJ=5NConAA}H(rE3rHtz}ZK#qI{@d+I)Q{3(*YRtBw z1C*%ven=RGsA~cO#(WE7S{xt!ON}|YWUX0Ew{>JA9FQl9kyuQr5<46VM6WU>Z&0gT zc}vgFsP11Ah;iE3mZq36x9JVS_|}2D06m}c2Q?E}f6>QlV!+iXZM`*rOUc3%>9?9i z(0#oMmyqmDqt46Hzltj{W05I{%mG;+e?60$2mzmJ88kg(f5`r8pF~C%>+_i83x~vC zLrc?6{!LTD*6FnAF}qe0(5s-F2bj?-p8tH1@WKs4wz~n{PmW@K2gWr>`Zs$Xb_X>b zNptZ7WOc^^*_g|}6eK{X;1ORopbheXvBeEckI27h+y`wp_98$HeUbRT@*wQ_9V+mD z_$UgOP$%=nL~W<ZSk0k)(2c(GM*y}#trqRp4t@;CJNPH#{lnWO&3^NCAu%Zu49Il40%-FxX*sR9 z$s~|k)8!X)fO<;+>K%}rj`^zSpFqLxpFja~CJ14e(13>}1ogN8$Bun5A)J4#rVp?O zHT47bw7Bw!bN+4rMr9-&f4?yb?>w^tcw;UuPPev8Dds%NE;AT-jP@jWP1Q@lyDxIX zF?}QdOx1hfPf+AY)HDWsbIv~kgn;fY;Phj-1ZbRf+XzF{uY#G9c|F8j1p58@KMY#P zZ*&%7=8+4AIhfLaAKdB5M$NBWbEJ`S&R-V@xO7=76ik2d&s!gd{Sj~$CDG4R zu;3R!P{Lp3x1ps0KP9b&Zi3MV;skd}M%!Yv^TNOKz3@y)ZLMK1<*X>C2_x2JrDlWq z-!_fnpLmzZ#_u@$TLlny(GB`^4fKEI{O4hQ45Apqel%CS6O1Vs|4?gJuM|7gcm%4d zLBPxK0V5M1p#%wdS{j5x!i&^3-hlDGEak+kdNFQrr|A;W4u}qbfTMquAfowqIHDCl zKm`*BAOdTDcbQldV2l?NwpTHtw*`_M@_Gcp1U8s@yVS@%-hhhbU~ypJ{bjFsxbwBy zt6O1V&jA(fsS7FxD!Kq((znFL161@pR{b+{qwc$bNjfPIb|tjn28tknY1L}f&j94k z4&+{s#{>Er2CYh$)bN4auYrV4K5L7=Uks+CP?G$|B}`mr0)fC1G6WlgtO2asv8b9P zP%j3cUiq!%abPSULq*Py*DII=hWCvQ>$MyR%Y0y)Kp5(OSp%z0<_}uER8qMJ_M{mW zt;^R*Qt%6t|A)nVZGV4#u{|SO~U@weZpFopYcWf53|g9~|>mjMRC7D0uZM zFtIy-kioi^bkT}J2~)f;o>kL~kyRzuIAUc$-3qiDdIPrI4e8gIKL>$!*EoMiS`0zD zt*nIj^06jpXQn-0;tb^KsB(U7^;-Li31*lKAS~%pf?2<7j36x8JraSjOgiFfGyll( zQfeMA25`T}u7%%Gcld+{9%crrq4M8UW1=mJ@joO<6pZOf7QBmL*p6B8MdKLI-0Hoe 
zT_S1@5wN|zAejJFv4U*wzpwiL(UJ{@cBYQkI5|5{BaU>_rjKTKg%}wb`5YYbhA#7k ztcVU(As+IP=^ZR-SV!YGD80H_w?@?6endk!HeyN5-$?V7$C^$`PZxUOgc;NUJtznU z*B}6S-*ZH`yI&wM#e{VUhopg}m+rt+9PkG;&V3`#>`?YPhC40wF)F)pqrFZLcP4RI zHg8VedsT9qr897jWMJk}jv$Qi;RpVw?+fw=Yoi#9?Mba-Q4RLJ^cw8QPpB1X&JN+S zO_mYSXB!EDg+pR)x`r!yrY+Z~#)NUZ+O!!(MQl1Mw1fEdR@v4L?e}Kc1e+Yv*dtvm zA6uFaM3Sei?!12_EdF)scF;ZM)O|DjT)(#Nsj|4fv=o9Sv+Y>g;cDYJpB9h+x!nD- z>>4AOn{$i$pgm>^1&wp=?w~w617i{+fPJQg8)kqFRNlxNA)s)(!U7E@bvu*7VXW%M zNwjoQ^ca7Bb)?rbqbG{?Ca&@!i#)7PQp)0y1P#14+5<&;L}8Ii-|R;wpDp3_X2#p( z65ZBRxLsM%BP#^?8EN1hT$g{fyghJs#l6IM&uxb7%IbH!;x!UpMEX*p%3+v$_SQ|C zjK>0xj=K9E>}zD1i9YR!UO(;z$8)rR0c*eL8rothMl+3EtR1G&{=vq^P9Ml{e4}s86^+4B#pKzIRdl>)0joJmB<6F%qteN zf2BspCgC{QAwJAKm-*`X>CrEA5l3d+%l=UPJ?%X!x9{q5~Axp?fhCD>5D{azKZgQh68_SRtfZEs=hA& z8vU0(Mth?6H+Yhe#V@jX;>okt$#TPmx&p=Po0c`qwFMViGC99bwTIZ5Rt&^ehh7oW zC|?NX$>26FpQ5v-%GrU#zP!OqzI7 zB}U~Ay+HcnA=mONE`41v%WC(7lai19#-(vwt&riDDiT;~T@M(K==0?}l*4*+#Cyx! z>a)^=6+`-Ft=eHJ#|2|v=)QR5vLoWQmb_eVjqZZ#;T{t-G-|H@=I5LO~<2RpWC(B3I^OXUE1Yh zEqCF2rFIEU@3mQnKMoaArYpC1_Kk3f2#KGtN75e2dt7DG1r8T>(piQ%a(I^%V_m)_ zrh`lp;rnm^3{vZVf&+zfv_*<$*ZShxz8F9Go}6qu$1T*CoZOS1O%er!SjeBM-^mtR z7=@p+FsC&~tuPnJ)L|b}BG6u5ex)>NC9*?rmqdTC!H0->IN@-u4OzC^?u-A}N|QQq z(|)k1k4bxzJ*%l>S}XsV>vOG+0rs?O=o`abu@J5WO>M307oUwiFJw zpSvU_*X6L3rhi!#hZ4Y}pX+aHti4n6sxZ_znhdlJDC=pr7r|@XEVEu48n0|vfIJvh z9cgisc2$?yD_gWBXNjPE3g>($6|Q4|=K9GiYZY+i=dG|>w#llxEa_hvUWF^l={15~ z;8ZeAb_m+;E~Yp(Dbxt?1QQ9Y(OP(-N~s1WVA=<=j#&slYvmV>-rXt1-U%6)#!CD2e4xa`&|aO`s0)&Hhx;JB=Jp1`R)`h4Ubf%N!o@B>p*>3~ zXb=%=kr-WC*7no7+---|y22Y}E;$P?oljO`N>EA9x3dy9+2L~V;88U22~$J%apXfn zjQRLV@3}}?De6nOTUv$N%BnD5)?o4fsb2noWRTLgxS@R((l^%jeu1T2teG@Hg_nyl zpYpj3!Ov}tN_|t`j}4zF=%m^gj#89{Il3h+_nw>guCrdg^-D0LM!>!|#+5XGA62kS zxN~`rBsHXRVXLp?*N=rx`^S9pf;?;Z?B)@}g!zOLiM&^jlwMC^$wr{*g#)8mfv~^@ zyO@$!4knB%LGe#x~j#L_|Z^5cRaG=)i%i;0^{x?%c%X~oFQnH<4(^iovqfv>au5+ zK)K{e9vUjS$0Phc3)xcg6#3LC>vpnTqUU;dPSuCSy{W-;(Ss}ko)u^!!4cM5ucdtB z?RdJ^0SuM%@Ebw9t8FEx&j_m9#u(4XIT6sFX2rH|)2Tm&c9_QmF 
zCdNNk(NaUpYmGM1$Uk^pJ{(zOnswmnB;@f!2oD)Os{XO;Xats=lUea(+BQdojTu4_ zmO3~6-0x2<0R9^rI;k2(9CqqV45MnCSGyHrq*e&j+M5^pG} z9bhD-W~*|y(B8`J=kodJ(KYVAR<>QUtWmyFdqry<-fCU_mFUh_txrYkTuYCom)zi+ zT5;29J`)TG7V_LL1ahv;(=-J~T<7L`^2U6#C_N0vXYMwQu<4DeKbbD4h{MsoQ(41g z<;|{Plc(`q)}*z1Y#xG(NqQ{tm>kCOgQF{f*S?>9{?iDybKmmqrBZ&*iFT%Y#7 zBQZ76bZ`%Hq7;veoW8u#au}f5b?%!N>KM+KAU&H;=5{?l$D!On%}i$Tso$s!z>T*2 zQXu*SYRxZV{A2gcPZ$Da0SItUkvh+tbN+AefPZX8nJh(pC|MXZA!4|kSl#3dtv#%H zVK&a|zSOCAvlY4(*^-%zpW}(z7r_SBT(-w~rHf$m)^2*O^ZhD=)aB6vAGHpCWAbnv z)YUG95IHR^6%m_0H~Pys?9Hmz>r@7F8Ktx;%Umb&CasbaC=ztJ=NzVV+~T z6D{(Rf>xai+(qzbKMp!3Ip&F`mfpRbY-_bm0NP3+OBNRFhcyp`V3kLQeOu=_4-wIjQO|;nzM?eQ37Wj448ZLdK5T#9~Mho4dFsN zW~qdRrb@dCXPQ%hdXRqEAP!r`o{fQWKv8(|qn*SCuU`NhJ$x%^J ziIwhNS`PP?&mr%-zL2zi2I~h88iIjG@8f(H7D1&MBO6?&XJ)?U9|;MqcH@m1lwNo~ zeEwQ-b#jGH3Z{_eyi(K!@~55|3L&_JLrNxk5uWf03g;#BF0N(NuE+dE`_-ch1}_O5 zgS%GBr0zSuP#W&1p_2-P-QTSYg?$T)qMiK0pM5+!>!aukHAYvC@+m9X*N%4HU0GC6 zvWcd7^mfV;*ZGCent< zP$BPbO{BUi+|A8<=Xz)9#F%-!v}A{#(UFLz_f-RUBW0b6#qGVN!8Kw(Ck|eUuxMxO@IylTrHQ-iUu=j=#I}gGH|5t3>(} z0qUDy^ObLu%=o(Q*jfGxu`(u#Wbl@K(Rc-9&OBREF5D#D$hspvvZr_9%Iy;d$+=sR zL;6*)C!Q$$PHT^q(dk{XDmQ=KopxgeuNY;AfPrz`tY)UM$})WE=MBlc)**M^yMn@#*!EUFV&P|Ha1{2ZSzC?WV#ud((L+CY!=jPf?TvQ*<~>58 zW%3~I#ZjgGPwj%uE@`yQ*NCvnqqV?Rw~{^7PGb5o!c@1#A_&g-=JAdT66?ZC7c!c< zwUs@+>b9gey1iv<#7i8n-^Y$y&w1x#6U;j!Z7t?;Zi4M=1tfB?xT4>U-i^ZNdpBFB zepqng%Sj)NxNC^;^zyc!7;$~Y;1WaS)sP%onUY;wjBqHd9UgL3o}X3BL~dEPBL72W zB56g4n8P+y(ZY@*&6B&kS$^)hLBG@=PT=Ss_3}w918R0w_l#~XSs6K z(|yoSTA7g>KMVJUJ`2MK0`h|(?ZaUGo16l;KaL989?=C49rx=E~tbYyV7SYAAywy71SIw_dZoEb8seEt~ zgwZ^IPu0$#8|}s`i@1bWDQS9?G2}*1aOCZ@;M7ppR=bB~xEzf%skF~;^=MdGwO2RZ z%a0=-Hf}PNX^W(V%R;x2m&>GmJc88yJfyh{jr; zemBrg!f@12=Z9_e++hMGEoHH-C;UOOHR)`nUb3xvp4}m>t||q6z8wvcyx>!BPIX%U z!~qdX8pPE(W)Oxaw~}9HlJO=MMI}|wZfNCQQLk}T$yZ+2F;N?|-Q>fES(RmDX;&rY zWwhpmrV4vT=0R1<3M8GEH?K!g-qXM3Co5-D=Z&knH@9N;s$cul{$STD-c<=6(!52x zM&QkkHsD8r^yUt9p^8WMx(OKc16LB~=!8T))m$HGnr#+GZ7|11N3D>__N7X(^x{1# 
zE_lhtm+K|Q#ARR;c~~jPH(a9b44GLb?AOnJ#yt~$^C-u#R*HoZziqKkP({~&)xx!< zDWnYE!Sm)IN8ZjT0Qr>()QSYvuP)76vuZN>l3U2iSRrBgy8ZnNcY23^QMASniMW@@ zxs7Pr8cTJMSz^h%km@a~y`Ptda;O=KmQc?rEEDQu&emL@-Ri%i`{fhs+ko^EapF~d zt?|K_AR{85UVh}4Vj!Uy} z+y4#s70p7J4+C3Rw$D)O$F3eQ#g8mXWfpNP4Oi$HFO=W^bh~(R*=GC^T}B;+_&{C` z9v}wiwH~up;8V z3nL3Rx*3ZSYgNz+h`Z&Ro76v;Q=}b=TFBSmI!t-R8;QhSRxS z#}SZB_l)@Ag9Dzg?}~J9oa?J=Iv8J$M8_-N3@H7es#KoU%G^h(yIz>lck{d_Wa0{4 zzGDI@WrEh_uirEqB0BWS(6f2=s;UuXIUC|=bY$OaLGD{ig717CxfWf=rOFZvPCh>r zc(f_4d61 zTTuo&rOZsFo&phE#o}a+P|cMqI4x+$ks5`}4jtC)p6@8f?k^U#9#Y_P9gOxRCI&_5 zC{ZLsFD|VuM|U2|;L zn3U}^Z~-w5!opSmsT%CsaYP5&4RT^IHRwLgWQ(pLv`2>BM631a⪙~lKwWjjh{Hj zUM6ucd@m=Q%m@~Qli0sTSm@f2Y&g(y*TS?s=aKK1T8p-F(So5R&2fZbn=lM%8ib*R ze@Ks81Tk)Lwx3)5?M0CvH2BGruGi~3Oc0My+am@>)Jy#OaW@gqehDURSD;-_2o=iBsZ3Z2*7zx4V>=MMFR!-c zrMh`c^XP|8+xsDpZbTQ!?<<9m0+>3r#P0`380^%1gs9_et~) zYCfb1d>r7W+#c)&Ag{^ihFmaft$EoHSbDn)M$?dIGZ6`KB&F7;|2U#>A7AMR`k z+Gw_!TG?TB$~zvFVutqQ8->GbwDs8k_3{tFog!&jIy#2C3G&{azR%~1B9GoGz5P}& zhA8(Ww?zn=S%ohWa8HHGtHxMJXOx;MoTGE+&5wdZ7>Tt(P~>S;Y~acbXoZ^Y!S*!< z%CyeaXnZCIgUF*dUKz)vD_3CpuiJmt-MlzeCqEwBov)*=qWjT4y1t#)}2xBDs!N~K%vL_2m4|*8{*4%QV+P60f zma@0?;wiZx$sPkCrK&&mQ@8U$3{R~;C$ESO!B2Ttt#{P)RK#P1*pJe^ceTEkWBZUR>7P+-SR8>4j(eJWW6rjYX6mF zpor@&eB*M;(*sw=4wvY2v%1FJN@FN${FsF$VWues10D}nMh>Un2!?I>jUMNV*6P>x zoeEg06hr!&_ay8YFBH>Wq?)CIHTq=HGyCsx?H z;??zkMQG7&ifH8yWOnG=<@&KzFdgS+JsZhaV(yD^2#w#ZZyVEqG!BG(DK+A1%Jng< zT&I1%jc_m)V{%?UTH!c1hM1+O-QfF&j+KK;j~2OoODSt{H+~`JM(b8t9dUe#QP)uTK%A+-t8-LayUUO?4=7Q zMn{X(o57zx z)rdX3f2?O`t)BL7V#;8f$TOnYT)aolC0VL@dWn|{9@N4TV+un;OAob07k+S+6m6sx zsCfHD-Q*f;Ybj99AjYXvxi~nQ?+#&Gd2Z+6&t&S2OcA5&tf4*{eQlvQV7~G|@N%@ttmdL_(iX5oy9%?q5E5eyM{uaw6OUml!|I3P zI3(LaSau~=Xi;i@z9$v^Wj^0QVl8EZ^6rz99IdpS!Y)q@_V$bc{_ZaHY&29Q-Ud!z-=dVs|7ggo`XcZLUYL=amDJ>4R{JqUM zrE9Qnpky;&$IE@NVF2y6bFJ9YjC|Es=xR8sRt6dVWilq|qLb=Qsl&NePIA3i((z>q zx4coK(8clBU80jQaQFw_ZHU%K#(zSKz=y~(s_Q*TLzVB1=(6TDecvgtUPukxG44-T zC{iNpx`xQM6y5%*bf-sG|iMUz4*DJ3KgF|UwG+;Ag?*`H-qMBsO9Re9DZwH^^kdOwZMm6xCohh430bhO^<|R 
zp))toR(aV*dyj%_4Er}_Yur>fp6ZedW5X?0^LA{OZfFa}Ezb0g+ezp|4#E)IjY#8iFL6!dYnPPG>H5wdqm@gWns`c%VqIHi zIY#^0v2ir8UX3GoDl0d@4wHJi_p5vthD8BPugmo89$IRy<(a_%%A;4 zSkJ?t?v7^DqFWojlt(`s+gMM_i-9ohw(gG&Vt5$uFCXcCFxReg&ZrPS(J^$5*6fJ7 zlE-gf@;c7+CK+Lpcazd^diRGFD#It!Jh@uy#FCW|s`nI5(?d#fWE_#X)@DI6ITa-V zq5U#ftc)03ZEk`u)ajD+=bD;UWN~p$ujBd4;sLKe_{`WC)h1~WETQ{JmK}RxsqD5$ zhdxd2emSI}Zdb`F@!V+W_!HSBHgUtrl#zj@90j+n%nv$I6(1q2d!s8UdgExfLvBqn zS+I=w1>>)mj`_21MYja6RQEXW4(;~0tA7xw?WlA(SRf>*SPi4Mk2+Zn5gR5NHO+Pj zh6QM*9MFUyOC;%XgW^Hgj2(Zt!JLPFt8pzec+9X0ty=6>pd)v##tfFV{hZjw*9 zyF(jindY5V<6N;!8H-<=TmprfZ(;rL)=hC&)wKW9-gkvXl`Y|##*7FeIf+P6k|LmF zktPaAk_-xnk_E|8L?nk65y?W6bC4Vr5y?sx1Oz1KoZ;4X&diy6ru!Tp?%N&a`{tqD zyY>pTs@AIY*I&&jqYWF&-9jB3as`th>I)GEF5Mxn5qmqdmi`QK3JJ_BgM$)lWlvdb zMHy_J8%)u1TU1a|VP?KAs7Tg;pJo(urxL>2_Gh9qHkUh_Hl@3&bLt6mO~WQt{NER*Ffb*tN_k)01ZSVpgdkc(sqMfa>G0-Xf3Lk9!yKiBMis`0TOhG#<8_ zqJ2L8`-}6Pch{pd_TxpuSI6z zoefCyN$lHvV_lwh+&q(gCQVMxjmtbbqb^GqRdZ^GA;UqZ@B26Q=kkz^xFWSRyPIJf zKDN`wcBwFZckf+OVn@gKz^)=%VzhLNkus@}MP7W-*@nlH4qNzU+=0*Os~P-~ft0$` z;W9VGIPBMZM<@M^#S92b7fD}uMtV45kD# z%0i?@6$Fcv#j;n{k{5(|de(X~3g>IS=*^qgZpbRPrQIU4oHFus?%J_%Hr|SJX^QPC zOrS~*c8tx{^TE@VDd>SQ%1ttwoYhiuaBJ>*)s&UxJzLliP{&Yf>sYB3MbbfK`9R`j zZHpP9Zpdc;;B)Is5X^GvatJvUo&ILkYv#Gva+xtI&`SA^(RM<&VDxtjHJcB&?=Fd9 zPKvKc1eo^@Yqjmp&gzaV#{7A4tPh!-9NEN^pVYNwada|&; zDvJ7oNzk1vCQj{XUy+^*N-7JvGa1i$lqK-x`w5j*1@gF?v8!GFW zrYX-9MRxK}?I`6ZGLdAO+)ZnBxsWa4HaIsclI@dZFjpKF)iASFY_+zWek=5WTiRs6 zqR;1Jd=vS!0%pWTx`CNJ*G{ZdAFw1oxgzAKClJVFoW6l(iFC(Y!KbTB?$)-Py9pZ2N$th-kegOcf?(9fRf$6ryi z&!gs=JiK;aG_|;S)11V?TlcrA7%pjf8+A9!H3P}yd&+M5cJ03@NA+rT6tjAl#xp84 zQ4kkQ3{krTC?N?za%1UdzuL=ZNj8tzWNYQ6_LsX{j^fM-Zyt}9nR8x$zuWbri1jixAQ#{*6o{h8n66->lj@AUVgIIFSThRHrP%zUB*17zgUZbw5L=hK}o3ddi;_4xs%n}by|wo%pBf; z*DB25*5Bodf1c!HyeSXjOwEYdJ9h*vuGK z6*=GuKb8b_2N(t$&ZB}?Gnt_^T>ByWW}llg+|+K}R%SqcK2Li^N4%v1u3)-@pCC6Dqi;W0 z_>0*bv}g=$krHyv-W6>S@Hkw97^Kyvlng+d zg8+SM!MH5|x!o6r59{98FR2Ky;Q#<)L{%s#0@{-vvUrp8xI*CO4luQ6CYU-vfEwUJ 
zA^EnAu$Ig6U{3n$c%a?8B?`AX65=V)$OcHQ4(%9#&sGruK9+g;1uwv>5thy+A z6_)yl_hIP$i^Fgu^&42bMchZFb z;~$50z0r~p0Glg^_HD8$+(1&QpnW-A`2dl(0EqnXEKNSZzha?%wNdjxYCihXkHX92 zFk=&7V7~$Gu4qeq1`;BZ0TcQ;pvwfXxh%9-0*f9i`R{WZIsWqkQ&yU?>l5Aw%i&;1 zE#uII_I-%L7(x3~pnYZo#@7M;ewLInY;K^$f^ZLkB>K-3J1;>hFWk=5JzpCz^(or8 zkxS>iVR)^Xudwa@PaL%2Uz*hTox&?2stn-gkRlJDwc2)DXvxuZNbpvvQbmB+m}4h> z8_%65>uNziOXbvKa2Ws;6-LL%VPMr!Qj94niZ3|D;K1GV~rM z$18Ost(k}?1pQEjX)KGZ=u`lBFHECcG6ExU^Ei6ar;+9x%OXWn+qoJ=T4!eehj)JP zn|&)n)1yR)>79A$`MFx_(w`kX%O~X{Dh<#I0KorSOE||5>xLy|-GR&U?{+Yv3)55x z!SN-4u!dZP^kCud5=gSiLXf!AEqWzR$4Bvs3%WD@wsQ|0^V=@UMFXSJi--1$KP1t{ z#Tz%6_@#ZG$qii;D!HW5e?9j%1P0{$Z?%5@O}KHysr?j;4$wC}ioT_i!UXVr!c$N8 zdH#U~09|YE?U8y?bro@TX0L`MSJhPQZ%&B|^Gc$FmnG4Vc1*89SZGeg=6Z3&UBjOG zf>6N#vLZ?TqLBtwcpo%n#b*%Yt`%I|O)Q6mgVNH%^xufeNI~=)cH(CDnSGvjgaz7M z3T711eS!H#Pt@vkMsj6dJc>c`cq4vSUDVsaafGI?5opfQF!Lv7>=4cT0FlGT(FsRQB`rw}dvlX`7vm++lEv1T?UU)jLv2NZ zOfG3K`8)wXJWr%?yJR`4ANxsBaPU7maI#rc2I;^D=zzEN>q+RqN$3FCPI?G*KoUC8 zoJSOEfqIXzpT?M&JLbtM&_6JYX0B6UR-UEbnGKrHmXJt_*IA7Y9pynfA^m@M!dd}A z2_8xWxSL#TXgfg2Yv69wcghkEI{Gi5+^0Rl24}qCrDy!Sf?CgR#LT zOPiq-bNzu4p*oG)=_x`MX=)=su>p*<%$DDdxgwb(btjA6UiW#Y+yFHoErmtCO057? 
zFvQHT$a8zSAahXu#7XbQbLMElgf9*Sm`XPR?_h&!t_yEtt{K`m@pN>K~!#yJp)iOGKfNFijcp}6CmvWEettyX~9ZVs$?a3iiZ|kMc~KONZ9au2U$Lyy(m> zlKS$(;$Y4?$8XS{%b;j_(`T&#(7yl+b_iAu*ge3s2WCnud4X8D=)V}gXA=KA|6m@V z_KTm$_QRab?)%jw2MZdv^@&9VSQVnQ$n`iqVEZHV3sbHF&IU*(#`>#!#Gh0xgTt6u z2_px3jD7oo?eSX~|t2Pd@ukuTB)EljW_fd=2y_sKNA|BE>SwjQ-GBT1yYEt-F9*qY`VW_V3FHv$?PvD0fKk|Fe_<7 z`Og8<2@}dWA<+bLE(WgCk5^IgCtC4O?+aFnvHv09H8{cZJ!lUL7kL6pQ^KW$q zZPsZ2LJh434Q`2};DVdgV(8%GDfyXm$PF0X((=~r6VSfczZh+B?a$rBgW=Z-SOPnO z?Faq?lmo7PPPpSISne;b{c=k;Mj0%JCbZA)NU9*hKmjxLHA@{d@FiqUF|S3JfCg&( z#SVdMH>nPM4tKEk!SvLZC(S|!?|=Z)E+bN)(Ex%K#$M}-kW?J`Z`ZyIv@#pw*!j~S zhw9Ncvbl_a-QRazY2_cwau`t`qhF9Mjmslu3Y^Y=zWO=`djFHh$ACYb%O8{++hbP0 zjq-HN0l|=A^z6fITuh*ikn^Gc@c2C!1y1Tt$iT-gtmMx!Dsr}0RdEy>0Ac`ytjGh% zipcPdWpHs&X}N94tiVR;VR#ZBISz(-P&Q&fpoELOj%oMv%ufW1gso+6nMkb|0r+TB z8xE-c&rg6db;g-HEkcI*9*#_Vs{LC6M1H+LvJ7 z;i^Z%p$A$T%lvFZvdK=>7;9;FXvO$Xo~U5Dn{E7kU+y9hENt!nN4@?vEWfMQ+HSmr z%;;}81`A1IH>=Q(1HLa8?wW<9W~@UB8^^l`&d9h~rLltR!?k{nrk@!%AYmxc*O5ef zTPAbk{GxFKl|CdLHr9I2+3% z0+xfIuZ)4!0NLtbmEeW!c5xM|q0K%D{XkhQi6}UQ0~QUdkWNUr!%309|9~&>PpXOX zWJ&s}foI1mffzjCFLWb_Csm!2bWD5^k`SU}I;)Ne6jol|Fm4*k>byT;*rEtUw@l^!3CS5)xR! 
zMGk+uJFSWYpGI(VwHLVSJIO*fE{L=e`!g{wZgyOn2r55i<^hLiY)cWWDZGf#aBk1^ z`YtU2?c!Bmj=Zk(Qa(WPU1}zWb6;_ETH|{!l!F9b&0Ze!(I^r!Y1pCgRv#%c{EZSRGSh(~L;8kei+6|rVuB=*n@IcOW+oIndokNdm z9;gT*!dAiL=Qjf`{NJSlqAH#F8rVsI_fep8-V#RU{|N-0V=d9;_B)$H+fFX{P*!YxodvrQ2j8^}%%uoL(24=bI){*G#TRiB)oJ=EfoP|(k!nY+N8+kNI%HNdQ~Ox9ig13-)VWBHxC z_<-{mNEcuLQq_d`Pj+Hu;aJ~zQl$&qu+`TmQep%l@o)NA;vjaL$3cx8E(mloATbc@ zm=K@)c^36@6O87M$Mvxs@Ri7a*v;ziz7)La<b%Xoiq?J-?*Kj>a(ALA-cnx zc8ftqIGk=RkTR>?QTWj;v@;pMJqIPV#;jTW4;l#E#(7#hIX2I@9Czt5^$3BhTUk)Y2>Exx_`4Y6x$+%Fl%yW85G-e#oPzL{jb zW;dAj3O~Ze)0lemN7_>5shTcL&LoL@JMJ~CbB4Uj8xs|=)^7BipO@fm=Te`DZyn3` zq-(=2Rfk!p{q}CYV|EpjpfzX2+hemj`PnWc)uLD+(jb0E!RdU*E0qnqjq^3mTyv%@ zCTW9Fqk1u}y^lF7xaP2{{uAM|`{m;cFcPk7IRf{@R99@P(V4q;Go&rcF*`*s+3sY7 zO3m=DYTa*ACEA$W=J9NZy|DSAqM#q$t###5rnSy;UVX%KcgC$DXqkVebp-OX#kCa2 z`*ABq3mA@gOwZz^JiW8?&OMS-854PPmy(L5J_g|`RCk*QM((;#ykkIUZcXXT*x@TR z3uM(KJ>m8Cx~^k;V*Ahc^%AgUe()Xd+iliqFuoHfv0U9@9$r~|hwXUSZYGmax^@zW z;P9N#N%2Y%6*;rqxgqghhlWckzBC>*${O0NKKi>ItNkVJ$4IV5m5{sH;aZoS$GzzC zDN%7skFNC#y;peQFxMVpSyj~M%yRKyZA+g73ER?q%K$aA;N8^E96juzoqpnI7zc;V zHjfb=z4Ic1lhiDS{Te+w7q7keSRExq67w!Za$r4$Ynn#>g?GKo#g3xNVG-AJ{psqQ zAw30Xx}7MRmo0hB{7nnJD!f7$X$-3ue8ZSJa5nJHiJuM3jC9~Jp-OpPPLppFUF@`D za9JtFr4-ci zp%d6s)P6J`lSVClmQ`p`%TPPGf$E;a_11i&#NZ0{ILtsIv@fX|V>0C(HEQ>^Y(exI zeS)RYh}m29MiFf51=w}oJYiC3kf$H#JA10pYe?c^NKannNWWi~=75)LWz+v}y%LBh;yFFoA;MK7N8T+?L>?aIKhV?@&lu(7`cd zw^8uv5hg;vK+Rb@SNG%DSs8^dC8hMvU)gye6hNJpfAQK?Dj^=O8HFQVy_etGttXM)!^<|Yx((mnC+G(%qP2` z*_mYCrR_|;feZ(`u3n0ZRB(BQO@*d|U@i8_(=!SBOg4HWt|v6@U1nk>mFZjwrFFo1 z)MfBU_2G`;;eiF#TI8zvA}ZImJ`J|H=P=5c5=npLJSd=3KQ;HzvK zwUJP|^KgQe;`o`c$l_PW*gV^kYU3mZ7j#G}clG!Bb;-TCyD2hDl!=&MmJg=&&4@=t zNWDw6Fkkvokf*{Eg(ql}Xn}{rE78h>x;2Rt=#1mo%ENu~Ia(htI_2opSIHdX`q-Tt z?YRSSUQBd2%b1ED^`eWDpfg*3CMuhRja+e7Db*B#pO1Gj_#KiC6fH$Um;1ld>*?939lzG z6kc@L6a@A=v;@dlz3`$CIEQXC#2$yJV)(`JlzIP#274evSmY24g(%JwjrVC7E41jh z@nhbRvN)<4asTh}w9k#fu2}n!#o^A^91@G zX62x7ca7rEnKNhF6cv3>oM_nCu3uc7yOf0?Dk~`8?lSIe6XR1?)7EYtBOxKyhO!zF 
zn)pI+6NA;29I_Fi<>R(lYmS2Y?wZxG)zU^nP0iP^um{E8+@WH9LYwW}H?lHo zD6^iLjpAEtoj9^KC)6-v@W9{|F-hpshAA1jyds7D zQeexb^Zonx?XOq9y~@M41;;?fK$;_ZdKe7mCKT|Xrlc&hZuoB>>2-I92Ll2E4(sX3 zQ46_1g@td^CN734y9`7d&i50Z$9;N_)ZK2+)13WM?fI^|yWGkFx~UI;cHOuoySSmz zU%q^CoUr2HUL7B-hW>+)_uFMmx$c227iBKxzAd^Y#KQx{D?oDHa4nSgB5g6qyAg!< z^sY%GZoy)1tvvI?U#H!V;79YjY_o($IBZzDm}M?cvz$k1E0TmohdV$g@-*3a08k*m zy|g^>4B>gh>Q1x$51cs@eCAAXT^%&gcdR8TDRHvYtG#|W&qas%$E$~BNbkQv*}cZb z#^vSZ_LsOt-8{hA)|SV~`8Y4{#L|*AzgWByosq6Ax;0i*i@x&hOC5D}bxlo8)p^{S zsC3uRB4@d2+fq;`RIbVAil`|Htw5rtrp@M0uGTEM)@r+NABB#j=nbW@a;*IU9e0Z& zW^(4#89>DsP{*A02w#sFzk+k;deo#;c$;sd#8v0t}Pp#5*%`R>BaxA3EG`YXi~?7jX6m=u2YE1n z8wnTAWMq{Ups$>fi^;4c#C4qmdC08uKfYi*HjrhuIZKy+MM3|*{Z|fjR7H|0LYs$e z-jzr`8Nfu3j*U*1`nHP_!2pA;NYgnS?EAOl<{!h%$3Fj4)p9FJB{D?O+pRBfI-;Xx z%x>ZkZ)zAxOiCmk9b1x?zSPi=W$NPJRd8OO-cGSFbO*at-{;mtaP;FxM6Y2bx5iJ! zeUg@QClzZbs@+ zV3W`uP&Gb2ZpR%F5g8d4t*xo5+B`O8p6$gnv;Z0S*oQ=z7G1!20?VeNDMX zSDBR#C#=V{d^oPWB+{RNSz)Cocc$~9UnObu#=ch(p82_hNU_G%CpyF<(i8AF%ved_ z>WI12ulfJo#8eE9Y5rF`a4OKBI~n7oo4h!>Y^R8902K7(k>CWX_iKdfYVwtv&RQ`p zeH2^k2?a_UavJ8RHD(2{PdWW9q!&iXXHIHgvwtXeF@*QqN>bdX`Hr@j@_`_!gdX-9 zSt%mP?!ESYrF#WfH`hiwhzQ-C?@ca_E=$Yk#h?Ho5)@mZ=S>k&bkCkn;CZYCz^bN5 zkbQ+iAf0)^ayK9L=C%9%e$HwXZV6@eT%Eay_pG~o?sh}ySSG%*H(A)OZ&VD8sY}xQ z(G;cO{!x65iVBWwhWwT1!^{Czczy$A>nn^~Yut$na2h|NQR*TYdWX8~(W0t_E#BN! 
znysOsy9^as4~4}v%ng#~ZfO`cfs}(WP91rpIVuzaB}OxgZkg4COo8149_LwhQeWCE z@AeOQCn%(7$ zYGPI4v2dwcrL;GYJG7=>$YC;R_g2&Pm8{~=yEyi^#)m}F;em8JeYMBiOwzaV#>T(y zcCT&SD4A>D%_d7%w_mFr(56XACtGnJ4GiEWv6PsW8y?Kuja}(ATu*k)UhHrsIwojp zZtmk;v|f-?JgiWhPGpwv|MWx8NQ4;rmTQ-|atljGyGwC9eO$1*HGhOSdY4A9x6o*1 zQHVUEx5eXf;-k(l8{CE47@tRq;v1x$+eRA*QU}YLX%GMXFymZ|nIu_Dn41$?Uv;>q zq&r_)-BEhOqRFl!xgS4{H#^HvYg{5hVJ^Ca{C)@%p=gZSPGZhTZSP<`@P&&t@c7ap zK@!=S@)lPnKef8}tUXTNuZz>?JLHUlnR2h2q^4kar1&(V%u`2d^WCN2x23SXkkz!2XjEUO!8;(HzZHxnc25XrAa|Ngr6NDIGC|*pth*PKaz6g`InRnSanO zD7oD?Bkogvb7e@NRTHz{-sX(kqFmg&%OOpfCC4X@3b>r?22XkubCIOlBBLnFso`yYcQ7`zbaPWEfP)~+0us=vfUc;9`#d$)u5 zc}PKP#qNA0T@9;Jz&UOv`b_~ze9cO$%&CTghuX`n6}meZ1$NyNY5qiH*E}EhiTvgsRx>2*4^c%>lvDxw)cPo;3CaYut8qcM zO;eX9FR9=Y!m_ZQ(A_BtT9KBjy<=^^qIdM8C!6hV`SnJ@dp5`R0#*X0%Dq_AiBa4= z@`K%-F^h*I`~}2JI%bE~`*C<5)?+`v7>B0I7>l#7OcO;{C=NbO(>9EkD0`zccWCOw z4X0!^j3()y6z;B{x^#Hnq4}>nd+E*HJ`_ei?G3|pfk>e$gOgXgH zkGrgI%}ej9ge(5Y5<|C7VtksI7vZ>;#~?`sA77=s@TGlI(e)uhcYf&fs`U3KN@;YZ z&iw3Rzdc#ER=;Otw3BjeX=A1#>C~NOuGPCa^<~Q=a-08%NrXR= z$gIfJk8URPX(M~z49yJdbe6@On_R_Yij9&xNrlmMRZYhHw2A6hStmS+&U)Ap9s78M z&1I){e?G%m(Ubk4CynX*rT`A*M+z(F68I!FqiMlt9?2=sjb;^2R}omiLi zKDY3n!5Niu0u);1Y#!hhu?cH`hQyi5CjFUw%2Hpi;Mi?5Ae-KJ2D1;W4=QtzN@Raq z0PxGY?=QxiDoOF3Be9<|CWU`(=d68Xe;YG)m9Nr)wFu4qp8>qR8oTjftUaNHs#q|< zzYvP#J^EVK4SQF1<9;oIx5NJNCJl}$%J90|NtcG$kR1@Kngi$bgErIJ2Fp<3?6jL6X1C z+uX7;IC8SFl&!b8PJXYjIF3W&_Zh-Z0sejEk00pk(^DJ@j{77gpr`RlTwdS6!C`2k zt#C@r;w{d1&m!+`tyRvQJcY!msHmt~3?2suLl8yGXZriTun{L0LkkNVgHbiMwePz- zjuXp$e71+p`^eX43J1*kmvM(tB<@0V4W2A<9BVGO+I+XO)nZu;F*pbjfKZL+V(sl> z{ZLBWMMn*e{>|*HO-;=ecDvgUg@X7N9ECskl@=a3a>TrE@xOfJ`|9`a@E}Cm;HBW& zKplQpK_0#c9re`8V<-@v63dj!0q_~F6_!xP<@tLY+&1h{)X+^b_@hQjBx2XC0-d}1 z`c_uU+PqYL{sIo`^J;870$lkVLhLXs!+BC|J}j(k%`Htb{>?JpwjKA@_jm3PC4_KdqIN}OOvEJtz6J?&S33%HLYv=pmnF18Mpjmq zXpi5ScMunn?x+c->cfX0A4JV87h!@RTrrW7vdKo5zqp=0pmMzU0y#UGw7uVUq6#En4)F?qtr%~`Bu`Su&N>qM3ZkA6%PqrZnh+R? 
z8`W7=bxjs$&dSJ^t`Wt&xRdf4^6~AcJFh~eF=DTl?2(#QZi~E zpkf(z8A?v#@V78sy==;t=|ziUI=T#~+tDVh_E8Bx`VrOmw*3 z7lud#T2yniB+yY7srG^GITL2&+6CLcARYZf0_*VWv)Ft7M7R&mN^v+5naE8BJwa&k z>~QoM?4BJd9QF$m>>I8bATTg x!$0cqm$m(mI{c#!|ER;ChWr18v-rNZ?_IJVUXWL=`#$)SM9GR~irjtre*n3g-NXO@ diff --git a/profiler/advisor/img/overall_0.png b/profiler/advisor/img/overall_0.png new file mode 100644 index 0000000000000000000000000000000000000000..f74cf2dcf131f36df9901e20ea327d509c6fee67 GIT binary patch literal 56377 zcmeFZc{tSVA3yp$tx8nZvR9T$sD!e&ptA2Wwk#3GIt&K$Xdwxe5JE_{5Ms>O%9?#& z$5^wDnXwPXa&GGR{+?=n-{)M{Ip_La=bT^H{4tr&=llNllc1O|boH-4 z{@k^P<_M4cPhX-KR!}OPidMhdnB`{Mk%Krn9lc$g$D0nJGfIq$BqdGy zU02Fg^xjNV%OaAJMyqhfrY4QI!wPdS_a+Bzws#c!jnagpw?ygtq)&5N{#=IqSnqwJW7fE_M!70s#(cQ1s ztW;g=t5e7vTu~)b*UhKHtn0`2U0rBG4)}>Tacn{_4@8?1(Ne*fJ9Xm4a36z|f-Fa3 zS28@$FnrcpmR2crTX(%zuXAr zjh9`S8P4fb!ZrQnW^cNlSsrH0I2_Q5)vxR+3RTFoC9#z)2pK9eVoaZ;cdXx5=TFz! zQ?}NOlT$`!Gw6!a7@AI%B1~t`Rm?62KgSLuYXas7sg@OK>F2*Il7{I|Gm{8)c*tXe zUT|edk?rMpd{y1uxS^WqtG$){l3YA)(<{ygasBeB*mOikSOfv5k6e4)QX(ULKl6B( zKEfS(T5SdDfF#eTi)2(E=P&CiNkFg2T&0*22iA#|%%~ECb9@}3uekIKO>|dTj|_UH zlezbz#3#z|$N1@W+yr9n3#?6c!YTM!b}&56O%`fB)ea#a-qB^WP=y9Og~pZ z^ZB7)G{pyK>a=vMjR;I+u^`X*F2@XHO&BJ!KER%5bf21ZcXL-7d78nDyXUCV#N{hK zy%;eFqf^&ivs&)=vmPCOobU`p7zU*xq9+Fsj!R)&dL+GK&o%FL=w zsd47<2ZMB+{hb3|NGlhIg@;hARFO#2xizHK8>+gja6}b{^2R8}O!x63-@5h6&MQd690V#c#uXvj^M-(h2TyN=bTl+?=U}X`w zlXgv@Y^JaDLh)_`S362p(o#g=X?>Am86E9~=03QHGo>&phuD`h5S6ROm2pK=->8O9OhXa#3)Qi!K4oEgjiWQ-TkbvWUy`OcPg z^P`O=OQX~X1-)0b5sxh(i3BHp{9ryc_xMO$+9G+m63!f$$QrIC-5FZ4Cqkx&MUZam z%=GM1(SEH{Th$#m$uE-P!@1HZ;Z;JaJ||#dUE!GAHfz0=lD$LJL9}9d=Qup)oJ*1VH|8!ONsaCMj`!rJvJWow70uY9-Sme;!yu`7Ae>Q{Cg zw;3}HZ(@LDzsBm8@f+yg&Z}+TSVd4>M)dhsh$U#_NabB0Z1tr6hl-Jif!v#XhFMC zZuC|}e&mf~eVhuOQV(Wa#g2lac-j7g>U0g-8Li(M`YJ=p|Yd#r5g|D z(rLw=<2o3U1A|V=O3$715qg8uPYHN1^db`RQgBxd>$MTftIrP|XH-a0KEvd&-lJ@` z`Y^LDWqy=ZM5*23VZ^dbjncgg5tPF2GY4$PM)!XFyA`QS&Ay6h*7 zKJmh0r}tjPFo%aYG$3QsrGGe8J|2wmVsA$Z$4+!=`MN6*i4_?2Z`&e`dyPJK*dPvD 
ziZ&C17Fu|0Fi4-v0-u;9m=|2fd7(wY&wZUz{Qwrq(;j{?XHFBpieK9^nDItR%!7Q=esg zM7#irmDQC`ZtEFb=6Hb!=VZ+-o7JWF;~l9q4;lj!(T_%P2q{Su`_OAc<27+Y(*#1? zlMEe+1<0`pkuM+e$n-F*T4PHBHoc0H z!?S0L4VQF8th`jGo5EyflrkD>yIK_>F12Y*pfwBGn(0IcZANaf?|09cLmZ}rGA70* zF?MTrRxu8Cfu+wqJk%GvE;)OvgFPj*Li?j7q#-lyGPqIA_%6O-C9BYQ&K978@&b>u5dc13r=>-Dcb0hhrgHIgT^C!IdPwlv8vAqm;Z=dCeDv@-V zMQM7!P}cy>$L$|8+a@nax~=mrW);-CT-lP)bHhY}YY{zz~}Bmv7ILpD{LLTA}sJr@{}a z_@brMR@)Cu1(!3V>D>*~XJo#~1oJ?=3f$L7AnDyZE?`m9L1TInlX(x_S0aO9;u?_g zZoiW9xbo@ep!tzXW_^ouojyw`M!V){OwB(&PLh}zV=C|O8UDsOojBK5oG|K?$*er0 zENf+w?N&P$4)I02&7HhGy-PiILeac4WGDyc6OLcB9gH8rOKga{*g4*DmSHr#!9 ziUs%u{}ACRnkNpC>qb}_<4KF-JEnW{VC7bE=*a6Tb;AO<0Foqi;eu%FzUo%X>193V zkai9}uUbi_cmakA*tpiZ9I2}8Mxcrpm)(e>?UbATXfzXYf0<%eM9+!i8vGf~yqod9 z*UlIv!qc{loEqK4QwYcpg0!)ev6btp$&^?0B>C z?MrB=L;F$aVAgqU`gEeJ?&0)vrsU4c7*%E_(%@QIVDj0KVOH3iClnaE6?(BSdr4}l z^Awsl>e%_t)fom6+|7+R-|5fsumikG9wlE`=>A$$=MOjxD^SySvr_0v3kkf&baug)$MGJ!WT3t3}Y$r?;kWt za^HVBajUKpc3{CP&mrUBQ+BOn)_}~k28@;-PJeXn`PtN_P3NgJU8_h|9Dn2)n_gdO z!`Ty=`aUtayoj@^kdtqO;5O#(lIfqttZR$-N#1tedLL8ZLFu^Jyon=(R%*Wq%EZmz znihxUc&69jKM6bMzt0-^Cf(lQX&$^N-HvG`WR_a_v@Axy7 z4MJ>l79q9@ned0x5q$yQ_3UdYsQmFc*f28^Ic$+g}WhM}dEKzkSh zbdD}vjMRF{BydU&A?3FF(>3#bHfOCrg!xoP>BMqQ>m+4qndjGavC3YN@Xh~p@@fg} zgIdB8Ljxn{U9(Tm?x04E3Bv>U_(A<5ZMA3p-d`S-tBmZ^NXv%_ZiMsPRy|;3P8@zg zxZ9atZ6K36&!FWGu?+x%LK^2bX(vilX@Eg;4d z$yFVwCx^U5$By)QiK(2n>Cg;bb4F+0>^>pc&-fioc>86~4HRA1UD-!RcZ@c9FT>W1 z`r~uToZ&YQWJKr+afyj)``E`;p2&xkG$g-Ds0PFo-*O3%>`YBz z%5IH%`g}Us!tG!tE3ONIcP~b$&j~NjFF zz>=VkC}yn{+T1i5iOs#$hDmkej;fx?G{j%I4@n7S*gd{{ z1BD`i-FjuOGw*2*?H@|pou+&AAlRwLG+K-eK--FL%S5b5Vy@fa2Kpdf4c}H)ZcJUc zQ+mpL4hk(6DBa(;hTENnJlb*mxdeMP$}SVzWTQIs&8%ol@7=)jFpJNwKlUX7>`U5V zTiKBv#KljM71(0hbE{f+zH!VxAp#}fUIk)C#dp;mQJ=_n760vW+R16l_u)RJQ?|4z z;^J~>$HF%$4aOVxN^i1C+QQb6u#)GO`cj(c-O@GH&xF`VqmV2Ewb1#S>yYI%X}n%H zMwK|2yOBq#3y(j(Ie0ql7GHu)_6@eHctITF-RVn+;jj>;=r*xH4qMqaGMsN8r+_)JhbE2)=dybkXV&-4G0^M`pcJS7$*r!!jS2(VM zxRH?6G&LQJrE4u|z2`5b>>dHDc zRp^Vq+=eKdzC<%)Q8^FG 
zg8gA@xHHbDvWIeJ`R}{xQ$}F<>Di7`LV_j?yi1)5Z?yFf2UdPXu9>aK9tjCs8v+3c z4|fm9R67(WMlc@aRlF(vQk%`Kx0pW^(KTN7nE17q@*LmRL{ye+TtZy zi5y&UG#_tEiJ?9q&kM`aM5DN~qhH7@JG|KlZ;nw}oW5t`d=<`*QH8;+>;vrfz1s4{ z2$~QsUFf|+6U}X8%B%jYfv0El&PtV{)p-H)MwghdtFDQYyb8mL!|ukr+;l7KGBFTG zozRStYdy8j2KH4J6fMCh6`y%~h6Hr-G=?Uffq+iHIj2L!v(F|>HfQ=l3pne@ zKgX0mG_l_=x!{_SnJV>1Ic-1v54l_iX%(x_o2C2yJKp+IuKY;+hVJlJ)y+tWUQ~)u z-Nmz>3S-$3tQhU9wm>dRtto2v*hr!hZA&pW2Tjt8-v?8glEYrDow(nTDtX2>*0w`8 zn*CNo_946c_i1P9GPS-q$6muLp{j00Bhwm`8);lY*6ptPxjp; z7;!YaZmFn*3UlQYPZl_hE*pWZ9kqkbr5=;&o6Ad8&=jl?2rcf{mg&5Ne&<*SIr2$1 zS9bkGZ)oUc*wpH*8JUBe`MPO zoUkM7(pYl_8gr1!=3+grdWtH-dED1s;?b&!Xq<$8F|05v{cs)`GE8q1`%9FHK9I ziL-4U4<1gd8`>Xmwyc*EmEq#iKOJ8t;|;vm?B{W%K6p3r{fK4T@ZswvBbSD82eL3x z`nhi_Et|K)X0~KghHKmniRAWAvnkVpR?FoAMe+7R>IG0t8I-a)P@Ctdv>=X^yV;+c z{cPEo9Otz47R~2L$*#Pa@pQQEtK0?%4WCjaGHP_Ertu!RVjKZgXjF27uH)$@BkAIz zjTd#F@O8(YP!jd+f)l(KT&X?1Hh6&jrM^xRoY8 zVl_gy59zWZp987*idp?q(CElep-(vB7`zD6hOots`vWsds+-_rp+5BfT5zBlN5Qp4jUv3Hs`L00OhLB!{o zL@5rnnNXG$)yLuCnk6<{?17yYc1kEsfBb>(AcMSFIH*7)ms%GbHbrmU`EGlE_?-jB zm)AlYY#5M%!*bhuE}Kcqc)NR;B0lNNS0MZ|Hl1~iGTxz~j*im%r9Ig{i8XTx;~?3Z zV(DynrSDpvs@^UKFv@U&K;A3ObH3O`tvivm2DMx)3N_7T~zzZdq;3v+pRFw8iHUpSiY z$-FDT=^ET_u`u|!QTyG@>ZgtW=cqK6YVxocwD6tYbEZ>2g#GjHSLI=Xm~BY>r$2A6 zSAHDiF;V!J=Xlqqsr~-B4rr1BirT10zP|ZuQry)zH}_6y+uxh_&kMgk_-j%RrcC9j zfA}>Vc;X=1&3EVKVyEBlv--m?ocRdusROl9d7fVZmE<_a(kjn8d8d;6Wl_)f{`qg* zRPOW1jNQN7wzGOc!G9A_ZQZ@q+X7U!iih@J^}_N*tNwm!dZDK`1Tt5P%^w>0&rJj} z`rOk)IOy%D4nq{#j!OUX{Bu{-|Ia04@PZnuA`aBpM#?4ObuXyUV zcFWH?&hwu8y?ey9rJ^?j1BA(fj~~tIp!`a2Z|_uuzgUsR^|xth_ibI~1E!|7ukYNx z{^NN$)1)PFYS+v2WbBF*;Lq~i+AvmgB)P++kxPF^iJvS&p3V6y$eD<(LlaKjzDpq>d>d|qBJ>3|q775o27p7*@;rAuDJ>f@?$m&J2DY$ROgcN%uP{ z?o>ta8OH=*C{?HxRbiUl0YS{_LT@0|y@hMb!w?er;osTNf5ucWbw4W*5hANB1*orC zb$iVndMrOr^@oN>Y?CT1+NWj~`{ulTKP_I~v`sFBVD{BP6e(oK`;Wv^Pf@$i3XGY4 z{6HdBw>_L~hc^A$xAGjpWkp{FLyH-wo%s2u9nsxDWmq42;-If?YD-O|c@FxG$F_&a z>NrAWQTK4dSN3(ulrEGsBa~divB=;>GRO3~F&0yI0dFFshdyB7JG5av!YH#iL3Bcp 
z5QO1nH+DG8*E?NK-}8)w)3nA!NKJ+7-m6DqGm%@ap(IHd=94vPQ#F}UdbY!k#7#KP z@sz%KDRyNxGlh7#nQM>Vq`2SkncugJMgUeg26iqi#)qf>{axP&a(V`sv;#zZ_(?N|7b#vQ-G)R5>Ow;L0ZCxSS_g=p7_bYrZ(>$4_~5LcLa~e zX+!oC4c-!^-HgHx&y5b8SY2P}eri?OO33Nfwqikg3`I%}Di6Fr=j>6Kn>p$JBASkh zHL&T)!yp-WhE@Wl`JMZdB0I;y8A{*C6lU1=z?TRz1!)U|kp1r`|Dg|ynLZ)+ zlXL$*a|33e3x>GYAcGgji=*#dBTHrb8s3={H(|fqPARzyL#&OC%;yJjZy6hgB@BGP zdZ_xZe+_IB+A=9k>m9jL5QH)~pP)CF59p{hqLK#z!} zZrEjV6>3*KFdapgVa)#3b(`i=Q@&6{;eutctM4VFeG>i2Vi@?H3&S${l30z)%o6cZ z^KB2SWJaHe1CC?a$62+B?v1q9jAgU@#WS+26Kxx9yYd{aJ0hth*f!oE|HjFFK)8M! z2$kZ>l$=hmye!p|pFBPG)(T5WVXSpLIl*~l&Csgxkyk79BN3f7V5tXL@}7MNO&+;{ z>YVna;ssg?f#8$Z9B`vjgOp;4Xh*5GLD$p;cE_}}R%-<)yXdaqCBmW$2k zubDA&_&CvnC>uN`9It`wg3d9K9laJtwTG`^PNm5C;L`$3wxk;#AeQr)$wzVgA7!YN zKP^Mxnfplcoz(JqGci#4=v!plGXCy5{@g_;3@Y?ldL17so_&vA5{RH;%jr~;lK=hC z9p~}CYJ2+yTcO(8?Y1+rP-+Q|8S}vp#zo5}fesRm1H}iWG{SxgRX?1HLGZ4!^)Zov zefveeEsPHON{Ls$Kqqf?SbVNxivD9Ktqu6QT+Mf$ieFQ#ep4`fwe_@GL~ zb?U`@?#s86TboQm@L@{C#g?jS!^$?7vb(d4i~78@5LBnYyvwdTYom%$okQB`ZCM%8 z!^UQ@{BhvioSn9IhEo?I_}bmOzP=!BpR(&NE%Nr9!Dly*yX>}ZT3lY<9E>#e^<4Y3 zpPFj#$vriTH1+MpVx^qM8+wf#z@24afVIASG`^d4d?E}_*Pk!}2X2Y|F?h&bkcNY7 zUr;~G+jUB7bA5FXa-?ubaIi_C(~0=9=gkw%@#J3$ zr|%L2NXVj0NaJ7%?RNGi!AQ)k*ViD$<|48>TzM%NtwO0dbs%j=^m$9PyNP+4a zkji&G{5>Lrzw{n`eT;jHI6nqAynjLpVWfa-$aY710%T>d?X*Dn9T8h_H&ACOnA&Zq2)KR@`piGFzU zuh^3y!?55hD$?;&0Wti)LtZb_Y)q-PR2v-qSL{iTw_N}9Piy%hpD572`rjchUhUS> zI%?AVc=BJdrw=zDWou9~mW!Zx`WH;{eWFh|H8VC-00R0SVDL|i`WuscL1UUgO+c2n z{}q!|QJ|dt8{_?_f_#1owC2AHDEH*i(reUe->=kqrPAuR$O3Uh2RckB}4i5eo2=p@yi_UpJ2qPt|d?J#Qrp>4PGw{d%i9wP)zi z_QMB#fP0*n7-t!u08>wrR@l`nz0fez(9pkxr;8jA zH`zv#&X2jb(tM(Z`T=BKRaI3xEDZP(2_rA8St}H{Y7n6m5BJ=l`T)ZpK#WgCGAfuA z*VNqDIgbjaVjzF+65F%sHa7x1U|MZ$Z&Va;HHvo@^B% zml47U^-mMpBG6mq>+4`S8TVRjXnxY3A4(249I_XdSYGNLG%l6)AsBjAVXfZ35G;b6 zYQzYMzz`s?s*#fys3fU8A8q&e2ly+B7y(CHCNAdoJv^W2QrWJ^kET+q=V>~Q9Ao1s ze%>64q!2p3y_;MPIrrME3mWWfO*IEp1_>hRrcsQM+BI0wH=0GNfQ;ClePwHBk(HQ1F(5OT#pVC`aYTBE6V%t``j&{)6Q3P^0UOTgM-|%Z+!Ar=NymFEXbASksfTy3apuc3HJNAGG=jFd!v 
z|5{GSKW%qMoi6P-X9Rn|v|Y`@YHQo*N=A6Q!xY^WHtSW(Y6E}V-%5>|*r4rnE`)JPFE%^H8LH1x{0 z*MVSHLQgJ>n~N`}5iMctpGw!JhS2MiXur{|h4|pccgw#TuHV|WgN1IiI@f1|(lp9G zpUX)p3ETPfz433C0zvA*VmJ=V&4FhK?8=I(P(Wp*h~)oTE*hKwbyQap7N7~gsaVd*3K?LezJBtEh*g%AE`jaVdSI{NA{pZFvh64WO(RO zr--21zJ(f-P`e}94XC81at7$RzMSrQ-E`OuhuO-_5Tzg`N3YRQ?cK~_FWPGc1Y<>|Woe#{ zbH_=?vR`2(56!{hiZTZMdE%6m_&9ofm?~}W_$D1~UGN*C z?Ckja{R{|?+rRoZ8y>LL?~|}y`k{}n=)Beq536kFj&P~qF%U+0)6&{mAJYAqHd=cTx-&Z9@O6*aAmqDHgzrnQ)d zAB!2yla%0`$5VAXy@}uACBpG+@$S8?1hXyqW7bLnB%iI|4WDo)@xIq2{6U40Nq4jJ z7%{z1NYZ9^k(b>YQE5pJkBNjny4ujVfi0s{E+y3cWgL=lKt1m37&lS+xSrYUXHo?0 zP|*+z#ew&BL&XEdh1~(}k2;M_l1Av$zcopP-Z(YA&|U7jS4#F^*@3U{vL&Yr2a&{V zH$_8IP51@wJ#(`mVD)lbCjDhqvYc01?;O8W|0VbEuNK843JT&wN~mUD?W+E-8p6>C zh{~cGEP|muU_kQdy^yUZ3^|wiX@3RbocSVy60w)_>RFSpH{_-J27T5ib4HblDE{Rx zt~Dki{thU+E&@hwQ|8%%OG&^0j?xuF38~DvukOj<{N|9b6hxs+ zRg3J%imJ;PAu;ht_l(m*lGevm`Oik-DK$BX*ucvshbHz4+{5YR$mNi93i)Pg-VR9P zF^*?Y1MvI=wHcx2K;wH!Be?Q26(XD=n5Fi}vz@^cH$o#?`^XBn)`sMk%NE85;PjQZ zh+l_e#~ki(;P~$?jjz!Y5ctLt(uT}?*(H6rXoi%0@vktyCBV31dMCwS)B<$JW#CIFXu#myc@EjCmmY0uQc)3;S)DL<8?kf(-B_f6=KNRq zs6)A~HG@3{!ta-Yi7i6#)MAFKCou$y8DvbK#2q|Q5>jp4zP2V;ZsRgJx`sc-`(Vn6 z@2N)@R8}t99C@0)UmX_Vb;8->SfHL;eXg%jE=iG5!#OFjRX6Fc8=2+6+?YB=xgt(D z=CUR0GIf<4r61os98XL8)Oy_iV^9)yLYC%X-wN%sDzD+=8>+wA*B)SBdP&$y%i2)ZMJD7;xdNd znNs*1--cw_pC5$cz1VfS9J2P|DhDp)&1C;P>8@9d>{+#ak({{dJfkpxE}cBYkY^-F zD_hwLHLrcZ6DKIS@}(fv1Uk32uXYTc)Geq!60O9u)XQkc64}l8WI{VdNx|)m1jW-? zy$bSD!#N__B};Z=L{VsfVb5HPkjT=JpfLdWrNuK=Epd5G$>D{x*@LmIPG25f#x%N> z+iJ&jXVcy@^H&00+WspJ#i z@vl|2W@oij9;F94Xh*2k1>T6()aY_4`bVx7LD10s(_aVSy5(H~ zaL&Wl5A?gBmvKJt>>O)E!r&FkM!W99*x%`~)`pI|Je0xhAHdxaSt~pKy8E}BL=f~! 
zKtUJ4Q8pGO6%L$#uPLa4n@bBkFb=PXRhkxW*!drHk5tDfpxhi^>q)8~>kxh_tGxgr zJKiDwhSRGQq@LE`GKjq*R>OBZY4)>N{;tWXaClRv0Vy*)hyr314kX>VtO~4GUD}E?1S4& zR7qt2qdilh0OfZKu0ck2S*AvwsGevRV7c%l5w%VTaZv8QS&wutMsIx$9O!85oyU!Z zbBtKd3MishEyCTrf~iVl|VDYilK3WYkv!qu+%$ zBYUq4(srza4MVoTLy==irCsN_+#K`1= zQUeBhABr%yn48;5+{-}R>Qv??KhD#uXLG)|5|)JRPTD7wwWSK2!iQ!BJKD-XPb=LK zt*g$BpvQh+;EN;6*P0RPmEonec@R<&=q4KzPgfMph~{>;F+Y}#nA{4-Yz78rhO^i0 z#c%r5O9_m$QnaRcL7bsL>Mh0tx1Q{4++Ei1d3von(b^QN`nveHvT3J3C8HI3GMKZG z6R|Nj536vC86X^3DC)?L5+GXZ#&y0f&eZ!zJUpz^@9q0-p}Vi#R2?~#))aPqwKNMF zPPdeZ^+0^0xjy?D9K|~J-Vx>6MIa@m^E-yK3>JMLImSGV)HF)=yDoLM6FF2S`;1}W zJn|W95DkhcG~{8G&V147XPdWLp+>Oc2s|d(apTmXlaa@9`|u<0au4|`U$s1x-(RYH zjqgpp9Zndfnd8*aFkfVK2;Vvz%r^3r9@wHguac(nUq_{G6HQKY%;FmxMLL6>Ml8xf0zBhyg9-{fjdcU6;RP`aLfm(hBNz z&^2NCRO))tNc2QJH=qR|>&j#3Qz9}3(swu(_=vYaPDv>(mFJmQ+{uwYlHMAdl`*D8+_$vxJm}XgdJmO+>GkHVk z4G+hMAbr0l@F(pA4IcV~6+%_f)n7(lAvTx)?gij{=J0T+wL3(_;VYWvBV}p6p$#*0 zmo*#f#w)At()iRVAbGQN3@8LxO62ykB{6iqOAgWzFO_e-VryNisnAs6z8W5Ph!Y{h zu>gWHkT1FP$;0^$1DTR7w9P(}mWIjqBWM3GcuQpvqrfU?9(*q}R`iTdZO8;jB3Uy% zo^z-MiZN&^;E*i=fLGt^9`ZeAwF;9BoQfsD!kC&YfYg@P)B*P*6 z0@^Yu;)w%eSfOjcS;o*XRfFzj?}>yszZd66xu!^0Hi=E9k241MT|KfiZ{w0V%p+60 zfPT>qqE)|^n9WN*PqkmOl&`rUQsG;n5lYRWsSplKm9|w_m|(^|%H|UWy|EK>ucn=# zPj8E|x4G8Tgo_+bu7+Ze!hrzAK@ z@1T)>gxam3*v`!OReswz!cOC$0xF9A)mdsN)^_QQKl>Zt0N^AZ&|e8E`0;(ozpn#q z>-KCg2Y=gNU`N|g1l05Bt7+H%VO9S@VYio_V2Cbmt zJ=Ss3zfe{8h$l6uJ^47JL6JK_iQ9)4Q!;wSjR|KG6TLR_t=4pFtsC%BjXh=>oYW?) 
zXJSU^%P;d734FMpevwQ*e3%YUF6S@E`a*5%K47JoN5QO`jmu;KqXAm4LO}->V4eW3 zIxsM>MVYS`QIB7yCf2{??+k5~uIHjqaBy)Ry1%E}6bWCQU(B}y^oB@7^I&8C9$(m< z;7?}d!PgSDThS!Q6^o1Eg8HRs*cs&J`fN+f=P1#QjSU|kpAzao?^_fqJ3AY2Y~T@q z+*)5<7&OAb#_Gbk-=_c+m(4r@3z)F$irdr;KwJUW1mIC~z=VPMfZ@URgAV|OJ^!OM z|M=o0Li;MI91NR>HUhW&v%yUfhDc#iIV}R0f^Vz=o=3lIYZEYG5|P5j9PI2RPQ%w^ zktgDtz)=4OStV}dfc2o|??27NIBY{C07M!L*$%kMnCR%}H;ex>DfM-90HFXh zy1A+0QkOT@>}{?U(B$^`Em_)ro}Z;h&y!|$P%fWx$)M6I z1YA@EXvr5EAU9BFh*I_PiZ(Y8(gXjlH<+|7<|{a@p5wmtEG?I(|Vx~%*CXwg3` zY+Jz0NB;}C=7*5P2%S6U@$)_3KvtPRHU9O`LtCh@T)$wU-Y8}4N~ z23iM7obH-?uLEzi)Y}WV?@r9U2b1D4r!;=NMd-WKfy(TBNj-7?(q#`&AZ;yU(`rE} zw_9X~5o(1}VTAnr!J!3lCqM{AU4vmicE5c}g_r^({@tGoT>*V?5DOTVN+$F6E~rrG zxced|2A(>wFSF@5Q;Di#+w?1zt)iT2qp%(TD+Mvey+>22o<-R{C8K~6^M6dOBL#F& zY};B9qb$;bkBCfZR`pHvMhur%8ouX+oMTZ{xo^KF zs%*LG=I^Q8P{>aHx!SrJDizn$vs_s{VbTYGCnx8XMc#nW=^z_;a4Su$tF0oESO@o+ zh%CR>%?-%eA|Y~WOl&8fO15)RXgi4;XZ&Jw)6;>*q98oZ#v=1|g%n|cG zVdLXGI5%MA%<1onE-fo7NGs+q&!|ycz>&6Ad$viWiR``Gt&a8T4|qHX98f2``I6PWWs-H*VRoVEwpgN} z7nMlGQ?Z@I5)vofadrX=tEFYovmw}d-hMv|$Bpjlf~{%ksQS>j$#aj>50|)|l#9mu zPLzok$Sjp31YD9<0Zdk}G()PN*dI#tAqED=nDimfMsi=l;1hUgp0+}LVQ(koCk9a9 zt#$1u?9U|1bR#A^p-^>u1b*_f4OmHY=2a&oo{?QjmhH62)GYT_sj!dEY)0AgvvJoH zjJt!E`KYTS7kk451yg4y^GbBkXfUBW4q>icQ-nLU>OEMr6*VlV*-6D8yxY-ag;wZ& zdi(oN?OM8eMWabV*11(Y&#`M+paS-C5q4jfhgJ>szG?v%Sqs4>dRC6Y+CImAdtynACUsddGq zd%3=CrVmP?_Qpo8{(JA+=uDi%JLJ6y9ipe{BMkGGqH!`GninsAXn`6 zKFw|21+8i}{oVZOcFcKPi6Tclt(K1Km>b_5-B@kiA@NA0rP-`eV4I(FP@8rZSVpXg zT#M*S(wTvLdmNA#Zvm?{j{a3xzZ>?cw&~y2v56S01;Fz{x8cJWB~%hRZg;kWV1$-5 zWb<2m(EhzwH4Mq_L!Xh3>z$*All8yS`RW=mNWjzWCf*B(%aX@)zIgl2%CBZwX}&T- zA5lOt(;$BjXy2Le7=uRfNIb=RP&qUJ4fQ-Cw^mLO*rzyOg;VqjEg_rpY0|yKj^;ZA z)qE-~-1;V;RVYqG`)LQ5V0B~%8@n(vtEldfMZFNoRbs*ZMCHlh*nqX5|1axjK&H@Iq#D@IWTI-CTz`$hp~R50njFy4r|xNnA9Mh(t|`mWZS(*NT~ z()`-Y%=he6@oxCHs9OJZ$b3L2`8VJ|OZxfH>tZP=h3B>7EE4WJ3s zB<}SJ=u`zqw?Ln6#r5mgA3g*ha29ZkiWUAPFTS_#!~j4SdX0nuit*3n`xFK`vzM{1 z9@k`Cwh6F~PEuK!^o2P90kGCogN~@e=gp>HC%|J6R0|$eKbI#G;22v~PR@3IE`7my 
zi9$lYjuXy~FzX^r-`YWujL?ALs%IP>1>Mhhs`Fdk%U@js`i58Q!UX{+LcZpJP(Qcb z;@S|Kgx*F<{TVJ~Ci2gz05>HF$b9Z|0Q#jYZy zxIsvs1L=iYrwTP4U^51%Vu}N!sCQa#6OQ57-+0LFr)}ev+vmamKh=Br-|P2L;i-Y6 z9jkc>3r5366aG9s1IACtWmSiK-z`jBJ#wjRULja?amJqhf!XzmuR?Ko8Xp!yh{>Vp zkcCmK-6!&dW4vJg+{g>?LM|pd<`4HBaP#hioge#@xpcE?+g?gE`Q%cnP>4jvO^>#Z zx95&4(vzkq*GHz?c*IO%kK+d@i)*ZQx4L|%Y)vQHGx_4)qTO7rNtQ+dYa)%oJ_Q*i zO`3rP=kg(*Onj8gOZ;~3w~yG%#a)IPU1z6~`pAPfbHLkXI_CoC=0`v0U2R!YbX*zj z>_uZiQGDAlMb^_@^};hq;ADYYbEIa5vU+I@TJy9D*Wdudr%xn;tc0v@&IDl@oX*O% zOMJ7ztXScZCOr9 zisuaEDyi~~LbHW*-gb;bWXl*V%nYYV`OUofJ>!CdenN;g`q*>by=iU}S;Fv(7S5%X z0LmMYhc36Ye~Dhgck4&KezIVzms>a|(2NmMJ8h!V0$xtM_lu)k6kUwohr1t#aXD>q zRg(qdnnBIlg_@}+(5Z#d`v;PPaZ(T7FM(HzaN`#8rgcg-mHFZ)LKW#&XI6PU><6n( z>Tg6=FGFb_SRZB$n`^TepN*JIy&@B}dM7$>Dn8 zZtjJ4uAZ6x<6Vv1yeuyY`>^Qb{Ql)GA>#_sAY~PJKOBdMJGZ3Q(ovivtxOIu|a=&UBqOrw)O7LE-p+#NN2eYW<~HA#hoM$Cc~x3%l@gS z+UNqCHBR>*8_1Kj!LPkH(z`!I*mx)S4m&&28YHS;1kcy4##iI!Mwmsf?~fM=&H^u_ zeb>X9)-J4>NM%39iaaVT3+8m%-?=e+@!jni&tyJ>_CDv! zYmI9Wgqr#jeJW&Hp)b+k&2z{p&>2FTS>?u3{`R9mMV;oU{YWgs2hTO9`!c-{@FF`^ z{$&(1JA})-c4%q5;zDQE(_8b0swS?sSL1NjHP<^MBKFs390X@7b7X0{NXLactsUMi z zWMX$$IeEJe5$FOh`vyjqGe%?>KAmxty&zr!X|8X)>WXhx_JldpjYO|5H4G2(3#dj; zm+|KpUsqOE#{VDez4upBUAqNpM@7VfAR?fGAVhlaf&x;c_fV87y@p-{M5U_`dO%9( zp+g`LkS?LOAcWA78hYsEZr<;FcieO2jPv~g_ZP=-jO}t~?WfK;pS4Iy_Cp$pgzZ1? 
z=lcACQ$a;V#2r(Sd4eBD8V`}4uQPB{ntvr1f86OgC6)<_=RhLS7kb_^nM}%Xy~@j{ zQArY~ZmtZH;+oelMOS-H*~XpoU3YS(kfprd=ru$_zDljc{(S9?Hu#o}IEAwN^Gx*Y zj7v?P7_o0KP&ufVuBy{B0itr zcb1q3zCMRN7R_+4$Rd|PJ_qGqNhA--yB>`tdB$;7Wc{7jt!_rP(P!MBn1j=QzjAC= z6m?7~m_qleMH%)_n*0`7&q?!F-Gs;28o)hzt-yH8v(@X&y6uRPb_`n+L^b2 z^WM}?ymy)-0%u1>M}@*E1^$u%Oq^J(uK3}8t%#1FsFFNxpy%gX`*H4K?A?pT7tITb zDIb(FiGC|!&tC|-#=selGnve-N)FcwtTpZ_3cTI=LHg0BxO*A8@8)@5nUbwH(2#x; zM1;At4o|tqIB{!4vmbk|ixFLNci-;iBvymJZ-#EM~uKugyC^nfrHCb76&cO+BeY_ zHXTZ~NEU^l_B7(%u}|I<^(z=D{3w+S+ZUCe-s>fu5xHuCf?WLp92XnzemhPKN|f$< zHt<%F+a{=&X{wU)EZ4xiOT@Ejix-#<`+lqkGrT>J(i-G%daWn+eh;k@-N<;@8sl> zq7|JJCel=tkP;m<^Ost%9XH3?Mc3zR(kA+B1RP~vrsI@|R^!7g(1V8T3(sM}0z<&GlML1j8u%9WY33c8&1oh3rAiS`48pw z+r-Q`d3lyY%69c`LiXpe1lU8Hj=9npl`NjUAlcr&)fWt7ZCAhU-2NZG_4Kb}CFgnx zFM8ZBWN+q}o_ipTyc2G=VpC4rdMG9@{;K`;xg+q1h|C+cbZK3Ph$atJh&bzhDCp^Y z7~8M#LKG}lJd~Jvq>oi%@}g_8HUIdamKuFO8s}@>ivU>&ml|J3(q`EUA?okLzIYvW z6prfg4RO=Z+|bvf`3P7PDA7@|qX!cmMuuFD785x9~R^>As$0y4dp!1%Xo%n&Da+RzLcE zv3_+`@#hvr`U_{6p$P~N@B_A5Lj;(XgbEvG_Pra%fQOrQ2fEvB?duMMprF<5y&A@~ zH#YW3LqA<--)FJL{h)_x{pQ4;(24OVVI|xLM%QJyqFC%cUUN6XP&1Yqe(vm*OOhIw zWhMsbYW*f{RhMQt;G9a&-v211dB=9aZ(o}AiVf2qkzBiB7S&ol>|TE<=k_}v)S#ag zuzHb~8#j+`L1=-{kHl&qZ}+k;lm8?_~WR+lvfZY=<}g0n&8`kJ2mwGUbMiCC|jK_W z;J_qS53aras`m7v3J(f`AFKCVFM)1beKcgTeYi!>Pj_=MAtPR7tP%4aIKz{`+pg~0 z*6SSA%$J_F)K=euNP?=dn$6lF#;fUMkOft-B~C%Jw)P(8C3E%E3z|ru%cjJO`KAzy zlStqB;VJiL6YoEk1(P}aCk zG)fXiy}7FVs@x7ozG3x3#qK6=;s5H);48EgMtzt5-kElHl;(ve4g!zSwi0e!4 zKkp>)FaLY&{V%ikw}0_rK4zZs3@<-@{=UfhG~5vF_b%7j{S2tqhKX5LJ5H9l(CgWb z0l|H=5}agYiS}sKQxNh=yLdb&gPud#BT{`suvQCLwe-!9SIn?KPVY8q;2H&^A*&+G z>77zj%TBZC`Sz%HUYBX@pd{BXBT^pPjBZ#c?2no{5*`VcC}t!<`!?S{sawmw>qGG{ z$?bopjnh}}`f;w;R&+?jOb44oWdmmlfwmaphOM7dm9m(PXVb%+cHEHhGm$}_4u{); zHS)80UDhxYhg(&wN+{Wger#PQy#)}SuAJDop~QxKtjXfTJqzh$ZGfOxhha%thH|i& zDaXVh^_1p*D;9kAV;r{5bg;MeVc10=X9l(w$R*zXo}{AfW&#FKb6*17b{FlpP zY;wA1qf6!)12+&_O~JEFtdXY2f$Z!Pzl4Q-)4K}GmemwI{)4uuF6 zE(A-l3BFy;P*2&hn^U$hkICJv`n?|5<)t0q(XlqdITmdtwZsYV(CXfG 
zMEldyeaWj&=fsP&J|r2axc-Dh(y`5meJkVhus-V1S6w2iM|Rn;>2O-R`ZmcHfW#lL ziL#tD|3ZJKA?c(YGKWH{C*73O!0CqpbSxYwg6l{zr2Y2sSSF=ouSlH7eGys@Rr_c!(=0jz&UJ zDkHtB6JO`CV!kV7d8NL57Y$l}zSe;7#;ajx)qDir?@!BvQ#6*GtAhMoJq~(=-bcf8 z1j6hzF%o*t!xGKQo%e)hThzwX7DpC~?RMUmzCBPO2^SH%2ljigme5P+l9I_8r6sHO z5Q$Z*NO8JCbFiettkCW&1?y6TCQ1@@bf<)87>086RTrhO?d2vy8`zH_%|Yk2;LHlD@t4~x2#3b#r<=K7 zRo~t1OG0T_EsGHF`$-g`SJh3XcZ=yZ#LozrKP!PyR=a;SJi?9$+xR24N|=>{Z`?C* z&?*^F%%RS|H~6^FvF`9L`j48nB(Y)uss(! z1mVR8*v36B%R%|(W>l;=NoQ57T9p*@92xABQMo3okp~klayNU9PNP5%E8C12(Mm5a zaTvVmJoC+qEMC+|US^qZ(QUaY=eWl!1Ljgt^rx_ETOvI9LM)ru%^^|+7ZC$8Ckt&> z&TMK4;z|{MBuq`)Wj8tQ>&9DT(0(G7Ak(7s83wki1?1Sz8TJe^PrpS!tr#`Hu!1Tw z`*LOi2Q(XGOMm2kQ;-j_cqmi_S6ZnVtS+R-GbLiL<=d6*K69G47T%wa?zOGNRLz5LcN<&79{03%L)`+!z2s1a=R~H`IrqrEV{PRy>1lTZ z$>T=bS7nf{Y5zx*RsnTy!kD4{@zF-YQm4#g`@3dxVXDl zJS>daNBA6B0;xFnShR0~aj}M=5hY;=6w3G z->w$YhGuX1OXT-RQl5?V+rVie zxhE{vpE;g?y!<4C+$ZZ%gW>BVDT&Mu!3eI(!aMjYvVy)9231UaPjV>~uPPo2HA?(l z@A&V$0g^+AoihD8uXuU1K|YN@7QI~XU`Im)&dU-3=DGLgqi2h!q1;Z`ono&NmPE#%}{6|bdmGQu@OGFmg$AuKCe4(4Q9>BjCQYxc?$i18u9}oR4qEu~v5An^* zRlv@STVMz+n?we}_w+01XVG7IYK+#xprE(uJ%pTil5U_b`D?@8HW?miAA}ALMoFMou&~R`1Wh4d3Ov6Z7{e+mD(KBLY!)3A@w~KW|4CM~Saprh z{PG~xrdsRIWmE4fiLE!HB&R#osT3Uy3Ju5?i>lO;RJOCg@3@!fM_(f$;wz2&OhXN= zuF{P|aiyB90?4W%AaAsrU^1;}Px>e67g}>*DTw*=2zdnhIK-*&f}uIIRl9%aZM5ElOgVGFg4>N{VWHDDokMs+CDS#|?cw z@zRq1SzC@ipV|3s<9}5IRXVv6``W^pGV(a#_dI-V<7* zf>-e}z5`;}`0JQvB_NU2qwu#$qMgvLTdz@3S#VO!zKiwG%k~dNfw$BV`Etj~NKUNwT4rhP#Nj%Xi1@yUoDH#dGreRuKi(&()|5Q1BIXTJ?VcKe2qNiUi&s2e)NieKC=L+9O!Hcww2I%>6HIM&`L*wqCX~ z|F5f$I9-7#Y$CvDUeaa#_b)$n7s15;vq`Dl|5_H~B!7>8TEvbr8UqEd3*dJaQ=AN6 zDo;8fqEm{Z4>J($%_F`$V*3Ttm5xZkx;|kKg~nbzHoTsd%UDD2qCnaFhm+om>)Q_& zY4Yu+LydD#Lp#S`qDDYQiY{d;I~MM479Olioi{<+?rU3u3409NeYOzt#ZdKlFfn#3 z!%HQx+oBr|y-owgr(4FCl<+8YHvPCUy!6yo8b+U5e@0aPDFT3QK9H$WDtrY-2)=wc z<>@>wWK+PQ4`h4oF&_vpPE27f@heZGcCp6w^U`IWd_VmXsDbCw!VM zChNg8VIl=KeHXa{ygVrgwx|g&C@3>}_v+HSjjsI`z1?a8!B;;~6O(K(CJqFM_R8e7 zz0_pGd67>Z+!UAkJB+o+12c<*fr+J*c=0?%Oo*{b(4W(09$tX>^ms(4bHS}A-reQk 
zy5rn}MJ(^_m%hx1q*@6daIFrEe{(6@OKCPlO=g=_A-M4>WZuy3=##v~27F|{LADrw z@JX5eq)%-;Z6VMr);MZycQRV7%8SIX`!iwC&9FCk?e^cc#c2o_O`$j)N3b2_%8rN6 zv{siq3$&y}K#M#bOcd#yl~NklOBNT<)ff2^kQ+#xG6NyD*e4O=ooX^bv(%u3AmgSaqz~_Y6GI0%d?~l^sR8{tNKsg^Nw~AbPokQSOLO>|_A8WZC){#a3C^QF zYi0Si{t|k|HFx{Ta$wDSwmBXuc`}aeMH?VkIhCMd8a}1Ks>HOJlgSsKuGaL$FKle9 z?pQYCbXFV+FL{jSAQh)bzr?yP`cmdjQp4Igczy1!rcbQ+?qut-=x??BOsFZR3-3C` zF66|nuzu`2_!M+u3%?&cb^oQ3-z&c~$w;S0$zH_pS32xs`ge(`_#CHF=Nk3+?OYw5 z;e`X43OZLghE*36lPvHMoLz zpZiej%iFol`TLc#f^XgRvVx#!uOtr->l!Xu*Y)0BVb2m^r0o8{o~*2CNn<$_yU7d$o~u2OQ?(+Z#V z9B^(p^F9URbksA~R8(zmv}?G`PVSHmG$GgHsU+CVnZ?W~?=|(MRo_j{J#MC?=V>*S z1KeZRUJ83SmhdqPlu`kJmzQ6j_n?+*>PqaxL5hX8zCS>RpLg0@+u-UN(V8IBXv}l* zL0J)EjNIWXm%P6CVfoD;XmCn(Ml_^4^`$e1G%+2AhwICm*%y3r8^#GNdG0x6@iXik zY0ti@0A1h%`4dBxaNn%1lxk)X@CbXR+(u<10pUsst{3!Lm%iuOb$+UOht|(s6|Ns1 z-rhS5s6gOvDM(C741Yzp5C$BR)rkO7SGJXv4e9$M;_xfE?yv`TF~HjniRc7DNAPkq&#a1ccI=cm||Sg;tLFE zR)vCV4Ky;4uc9xi4O2Q%lLW09o^Qh@&B}fZuX0Z>X>j{lg6Zu=%$U8STz*;=Q}U+< zQZL12KsLlF-*3ePlsM4rfx6P;0Wj10-Cl&_)SU|)V{)pNoRsJqVs9jd>q)Fw>^1xQ(+wvtxbRtj);QvfEVLOUAr zKrDFdhD<%g>^P4z7&~rtLor4quRrJg`sx?7sXjN{+ySw`*HAx1SAg?U!(1AVXtJmk z{f*fDd64;dFkF_C#SKRDy-<<8CEWe-tW8|B2Z_Uyh4_Q2pv?Zg5Rp>Kl9MfPu&f^ z-%I$?FYsWHTgLCotWTRI1MWUq^G1*9&fy4!;Ra}v(1&katJ&9X#>kq#&HXpIGiAng zh1gF&o1RM%!-l-0vftly;|6eRRIg!gqPg;6U?Kstpc53%Sqkjf{Yc6FYIo9b4m{b= zr^w$5EzSDUc28R0p*4|Fo1o64`Qi8G{EB{)o z!tq<|`iJk-Ne{8@pmf^+^x-*BRcPI#A!VVbx>1**-|P1V?|4Qkxb~puPclQ)?mnVR zqHTN7QA;1Zoz~Nz^6q7c^g7x|u*Gxs=nqaJf{$EpD38C_XumN@rbjq-EAaUHaqhRZ+cE6_-#BS$o}Mz@vp zXI}|W>a_c$YA?F3*c3Fbf|ncVND@uZOS`HDiU?X(hb*#zBVzBDW%YzhFh-n{9~el|vw$0i6I zyCu15r$`xhrjNe0yN7KE02EnO(`>sD#Vo>Hol8$p)D5bfEV6DZm4u-+Y>o@bn5W6#XFo5PDV8a@OS zJ55b(H$vAOEl_bEv<+WU?es)RrYHcwN^UBbci}nb*IynMRf^>-S`VZRmU1O&1SDa| zo=I_ga}58>>9p`G_ya+JyQ)>;%C!FNVff#X{(oZCstQm8LL4IbNh{Qb6QL~y{bTst zQA{S(;P=*r64NWp!OAKVn#^y8joG}-eTs%k<;DNaY5{kD@)q9`EAJS5S4{o3%5Et% zNzJvk1~M=Bxf43|u|xtdyaN2he|W8p2N1nPQV$UwhG7(l^je{2!ol%@FR|}^P9WM#^8<|?AplC3co9Pf;DOX9DMo3t=pK3 
z*!-ou&j%4UmY;kYl}`&VeeN_RC7R(Do7aZSUF^%rx}5a{bT(DvZO84!P!ADkBprB& z3ctN|vG%YExr#u+5J7TwzZ+X@Pw6n?!}kCgcFNUx9cI5jP+$L1C29@IE20*au-!F0 zFXg1QHEL(Vl~R|^Tv}F;{?4nSPjrnVe8S&%pIcFRd_0tmYdiJ|IbNYVb?lgWQ;0iM5bIY{^+8yTA^QPTH)6^@9^#6&g*g}2@4 z3bXUF%hCk^cTc2ZZJD{oBQYHgLoA7+?r=QZd{>_d29xXj_}g~6kI+~?H6v6=8h?1W zpQ_#e=-YbRifTFrb1w$jSRMsb$O~Z)AH(8VwG}SXt7kIguDoM=4>B_=jJ7(E0O`kpNQRqQCnZuKqNy_YJ9y)c@6-#SV56+Ad{Ccp7QW@_RRofxY} za@x8({A&o^d#op?Syr?W$xlU%&ndoXz4kVPh%jy5Cm57YHg97a_02b;oMdy!^5EFd zSQJ!fwOTg{8L*c*XOXA(Yk}+AH7!2M_th(3&|3o`JFhIt)e3gxfjxMRb4k%EoXd|+ z&T4@_*48>vUgK*kLMzeZTR4)Qm)jBDgVeuJ{gtTKx`2j#T!_nqo!^tNtqP%q3M+WqreTRbXBuurwa4N_U(an*{&cuuNX%EG z3<^7KFS_290$~ASORTKx#p3nlhlf}@kKUGopr+%(dHpPwVgO26s_h3ptK)eD+@o_9 zpc$UWKIGH@PS%Kts101>9ft3d!P(i%uWT_^%?k9ypMPURsUaMdVuasy)MDX5y>Tr> zi{>oj$l5Aln0s*ZwEXSU9$1}E4sKHS*=Q!3)#hXIuAXGhDEgvvVpj{K9ijWsG?-i26yeeupVF!~e|+uN!eBzI z`dEi?m5eE8TgCn<67Y_STp(Of*|4Ep`aF1_9gk7X{DJvhYu0ShNv9b#`p`$hjVbLc*5<{8!Kp@u?8)c;C5QgJlHpvY8WFBiCP}#5B zmIYkBfxbJ;t%ITwj_XBKRSXf(zfo>^I{0fqQn*iz#|S;hT3!WZW%~6==v&Gv1e(Z- z`|4w^bS+LvwYw+iE^Q_O{Ngv->0ZRrO`L*oJPTi7#nxtL*yy1gOL~6QCznTE?8^2a z@wK~aZzqh8X5O5M29Bkj+HHot@MxGsrlN#!2UMr$nLKEC9A5o-vSgGh&sPkKRa43V z6gdkp5o+hwrgN=4uk1N^G~Q&ym~XSv$W}AwkPF_Xc~NBxG@gm zk`c32v%74P`nknxG#f{Tr8mBILDELt8tRm^^qqUotzSsqJbcG5Y4Z^B6#_u%* zB{K}cWvX{ZU%B9N49OyIqks80_IT-+qzeRS=NN1B@`?AUZa+Yu2=@}c3ZWCn#HQtDHy>A^U_&;2Lm1lU0$KNtYTGrE7rSHsghD zt!7s%9t^6ME(2XW@49Ok%(Z9@C`kX+HYm>8HyT$=3e0R{P z|9`6|&IKP!-VvBa#xg@Se{&{ZblpNOiV1-FlCet0ZfD!mc}%D&(U}_tJ1^FM126c2!55WIE@$$6T(5X9?TR`48O%8vO9B z&Z`39PXY1K-O;g7&)!o~2nRV0L0tA(8)NG2dO*xC*_=hZWSK<$i?M0Eo_q5;Sto_2 zq1UV~k@2-__ir?~&N5E}(0!4DI{vHzI#-ir-;FYt`uP)mC9+&?niX+D)SwW=oD1^; zO%U@Rp0#CDi7(gOAIa4|ny4)PTBl}J)AO>w(iT*Qwe@l<7fg+&6}!rABfJ+^GU@en zig-KE4t~zj^l6^mL3hLaPisX+*E2bSx7mQtsw7P2B;B?HBB23tXqdg*6cIhzW^eFZ zBcV5B=(KLED)au@H8VFv4usvNUxtmgh8!r!KZWOp)etL4TW9R1Rr4p=h!oiHN$^k( z#OqX~BF9k_5#>s5o%(^#19ay-fR#U*Ua4B?orYT$PS{{DUC;SaJ{#~EF>WlSdcnj| z;b5~I2o1gX-IO!aqP-E&zz`C@JDgoIox5QF?35e%UwDqI|0LDlWNkE`)|VsxhC5F? 
zCi)Nm2^m8Flj=g!g|lnNn$-Vf1^fFgr6Ol9;C=2tsXe{V8C`X_erE0PXzkN~vO>+f z``s)5%L3gGip9w7bljc9FUO&APt^QxCSCU8n$)_Y)3p$r|X8HU7 z8^gc8%zt1E+hlD3UEwWm&9Wy4jzVgYSNq^9$G1fggMA3N%v@)6v36~Cx698HibVhm zwI)M*Xphg{21I>O5YiBNZP?%*zmc1pw#;MgF|WqecZRhGiQAj^lpRWcL?=t97sR4$ zwnkk33|jpo4tzOc1$7PWd(2o%`F4osOQo6F^p1$XY;r9XFvph7 z=qU8YQ-FqR7}-vB?$HPT!pR%x0T)0E98i9eNOlloAI7kY`I4fAfs&Q>=)nDP-hd6? zZdSjLD}D@T{XU-|OB)#ef47tQ5Li41O$ZSkL377;`;oyfg;9UaL9`)Yi*>J5#?ixx-C+> zG`~Gz$L?zDYRw~WfbtUjp|GD$LzJXiE{fd7Pz474PP`0p%mIYUks%enT=Rm(b&NJ= zSbt5`xn2Zy<^1A#p0N$o+7TKvsQ9NV#@?sMu*RnC%>WBXq;%kh0_A*k?tV6}hmyFW zMJA@RYhB6AQ%c*gsHA@<5I0GT*};Yg9R(xdZVo=v6^?-(UzuC%=dm=uBoFo5aCsy8 z30MKQtnOK=1ek;Q!+3+-7#Zm0`}gro4^XE9z>Qov9jZ_o68!w`9Fu~P3}IU;yNplB zNa8__O;!JM64^wc4Vm|@sphi)E$yTH=Dbc-)J`d0o`5E_-c96``@5>_?Cu+DHScO7*BVqa@`jga~7Nd5|8_aCPqdL z;;YM5+yeYicFtql+}QWVyId<@Af1iki(rTskkTKkd*0z28oEw@M%uDh2W~hXh^DaA zXzR6rddm-M$38evI!)*A2Gn{S4fmaLYM08~mYib&UhAO74YXrOkdes&bTxLM3b^=d ztK3Y6w;W-c>FhZ?G!$=rdM!mj|B?p2xM?hdWQD~ec*z4Y?FDKk?6s|mMu2A6XRfH2 zaVLtzi|jQ_=`Nnt%r7kxo8K`aKU^WEn$J%@asZwM#Q<%!;k@&J5R3%!``N4-tMiU7zSns3y3g$*&5{JVE^D!a4uHsA9T zYk@*SUw1JnEWCD zKS@uUPbJYe-sH_#ZU>%C>Yi_{|C4gl@QieJTDD+$edJtly93uvUS0d-k~A~t`CaXe zW}|dvI4-8je64xWCGOlJ+tC+wEe82pe`cG7!)S^YwLYtOIvVftG7Ej7#w%$BG{D3x zM^0XcxL>KtdYGY$xkTR_$QJI9Fwd`UbtO*{x3l>!?3rv7=1xo(m^dX1`VwiN9h>@W zAdvPb69yR98+HK`mClg%UF%GcqD{I&33hOE=&lW>T5Sfh+x)_2XX2s3&irJw{A@bu zSU&;k z+#L-4Q|uOy!{j|K^FF0j$S7d-{)moM-m5Y-RuB;WMe_<7oQa;-Dx9sc9N3Be*sBa7 zX;RJ#wo6y+AFNMuEPKCG>-uz%uVPDz9VR|5a`kBXsVpQ7p~k8=_l%^i1#|1*-SFC< zPyD^e(8tpNoD68;SC%e!1u_T;N!{W-_y(MK+naUgQsSUCT_&?&i-Wb!8ms^r1J_n!92JiXpFB@bUZQw! 
zDZRz#=PTl85z+iYi?W03y|z0=l)JmkbAHa8&rq5R?RzzY+AdEX)b2J#kd+zwi612ElodK=VY=*V8ndXQ`$-`!vQd1g(gvCy7LQHtS#{$9EH@NO-wmJ!Q=#kMR6h)YZ|m!jdM3Ws z2wYaY`F^I9ffaT0OVanfbURRQPfi+G`v-uprcG7Awo4yHkvR+GP}tnV3act7i^v|c zqrNmjF{zAk_542Nb~J7!@k<-O3R64D?2Ris#?X;GQM1{Vwl!O}6NSJ5!p|hXvxgX) z$>;`=rFSg#@=QmkJM=t%Z#{q_%AZRdBo|f!f1YRrtU3ah55YY%;}Ld#IM)4x-PR$X zjV&EobM0%omCE=}g{UZ=rs`+H<Kb1o!2h$>o<7>;t1{~_pqQ3Ft zf0(c8q8t;IRbCQafwJJ++-i#ai_m;j=W`i3yRS5h4trBl{a~rYhbzxerxcqlFo08T zkVQC^wr+orwOo+?f$8GFg1aP3^*S=shhAJa)HQMUrf?}>vOtDx&Vf!pdz!FHExP(Jf} zzc!E3T)<8qW}QC60tRMoN9W3bS-0D`d2J4o6@JRiJ&_A=h6#ia7|szplZFFEEy~kd zqhU8rUT=$K>&I`<`?TbO{eE4ExNQFyX#Eoh$N~vG1iAscH+q%^&Hb1^bK2yc9qRxS zrt25?kFwB#Z#E~2!`@>WU2fdgxKR=r_UCze9u~}hbz9o*>eP#jC0oV%O@G8779t01ccQloB7WTA`$toF<)U9gRsL1Gp z5d+}MRcv)$!W*Q=!`G0?fEtf4-FUg@Pntm*lgkANK+?oCz>m29un#{j2QEyNVF7u~ z&B$5H2{WLeJR<{=~Zr$BI9n4|>>SAu5qsx{(vF_LhH?5?9g)G3u%t6iMJKt9< z^$%UXOE;h}W(Mt2FIoU3- z&G3{BYqwwkYiYG9!1sDaUl1w8jKgs)L_LLU5&=dcq1P7u zL!+RE%EXp!7T~QC64CaBvlLFV*a;0Bu%iDIjAW!0NNM z7nw`66FFk)mTUGSd}zOZ?j<|!y}L}{;~l%GQI|RpzHjY?S)q#6EA)vN?_oGka)fs$ zh}U_yqBojrCDhx{N~LecGx%;g4dhd>FG`RC=4zIDJorSV#>-O;?3wN_4Vt|Q5o2MvR-g|eWTqkBE zI{7cU4cJ$#{83}|axR}01ZJE@pTgOnFYo49qu+uK7S&9~|0~@i@42cSC!1d1!D0mh z#y#^+@_wMbVxtqDVO?Pr|BPFE7hR_Kw-x}Z`@5HrybLAE0<;sLdBk@B4Hn;)EWnQB zsd%kyCl^xYzdc|k-J@BOo%Xr*QSdL7{sbRDadaL|g-IiUDVAW(7G`Ays%TV&r?*z$ zR{@1erL3*Yz3Bx#kMPhFn%_b5+q92T%=?nfX*QLOmLULR5&u195M{RtP(wg|C<*^q ziY3St^=$+7x!PXH^lGg$^X7(TUpUUm#fLY1QzOE6E~my+$jnBe*1SPeoXR?&wZzThJBOQpK`~_g2##W{EUU9fcn_4iKl;-kiOF2 z!!M$4d09@kI~+v;d+nEgv^h8P!%3Qoe(feP&9Dp#5z{Fa?67_1eDua0f6K~;qbB*@ zPJyg*`A$vkM0f;y=cLh9)j6}=R}!d>hF_pATK>J4d&#w?xsme6eBa|)1x$AS98hI; z#9)+$s;_VLcpH*;Kk+gla&Aco{f_HiqRz|@2EYH)|)Wfv8H_~xBHNy1|3d+!I4WCZ9G4apy-yN%P1*aNcG`2>?iJ1FB&Q5;v07 zQ=t)tec-9uGC-VaG(fyg#jmrgA{*nAu=$voMj4ja>oFM%*<7%Az3c*eUbFV}H&t#z zr}I4k>pW9y;>VU)J3d^i^^N;#IJ^d7yl2j7jn;oo$CB{C&ZJ8zN#pCSg=jXjZgzWm zVSoevR%9#U<=kxFdp&j~*;fJPciCT~8BGSu9<;S}q30Lq-m8Q0of$PWiWNxBIpmWKTy49w~$n-_0*X&MUvQ{=*WO)cI+q 
z{2vND;GmTaSFsriOazo!1%;`_OQV^3!JG5SrRdzGUm9#-S&f-wpMM|rmNq1sln8T= z4*eEi$&A&xY18%lDfS}{F&>>j* zN;O+8-DWnZ9h=EuhpB-}i}P9q;#KGUkOfQrGobFRj%zEOvAAtFZPP0ULOZ1Qc9;q%y+ z&+?x-fhACbfsr(O<{KEx_Be^dzQ3& zo)c31hgaGUx!Y=$ z4Pax}LkD^Ib%;GNe)QGyxZ7wF(ggAA_Llh_b*26Ks1DJbSSz0glnE}Z^aot=qpRMj zWp@j#tz)>~)mkbJOMWwUz!u>8XDKr2Wpch|h+=KgkF+o3C~Dl*lI3rq#om!ZZ^X!X zFC_1L1c_a`Qg})Dcsep{p5GDVro7#!a5<%rk~_XiwmyfzZk7%SaC*`b6&)rXCU|uF zh|9gv+v;m+@7J4knR%(DNwDG~RoO#Nc488M21(x({>?4CnljO|H;qDcJ7ZV}+(3J4 zUIDe>2?pe3Cz-8OVLk7kuwtve%7|Ao$s9-(a=S8eL=B|G>CJ*sDvf(HS;}E=>ft=L z($p}&EAw#DPU2T<0n;_!bm&lV`(lU?dEn}e3@>hvt@7vTLhe-P&V_<#LQrV@q0`#A zf+)!Jw_vBk_oI#Tu5K(;NlMH4jkVH#4@Y6mifcnMOmRj(ehwA}a$LDX(!1nA! zslm9;vP9G~ie~GV=SPM>%Mmgr(g7Q5?HO-hq%&=K_sDj~(9wXr!`WuLUmMk~ z(!QHhViF9RK=HeV=}rD6TNrabp9bEnJM4VH<~|Ozq|`|>pM+6Ry|2Um$~z;KD;AaS z5*}48Hb2!R+c$>j?a6(uS!i|^j$LmT@pM+WS@05GB(&;-FE);Qjq)5!eyGc_wR!!O z4ETzIdiY*~DD5=E0zt1n-=6+bpIfy1@8dbmuis35Kk%D7m;?Nv z7g1ohKo-*LNr7Mdvbpa$FS-@P_2bxQwpYz4ot!~Q!u8|CpRM^v^&Qks7?g?3OW+&| zV9Xt>e)C@NLKK>6aQ+L&=yu`~&5^_LF98x%j3cXlwHuZoRObE4_n& zroy~E(Q9AQR_>`^&CJ!Yj^LDPH`mFmm&|^LMU&g&N{mdoz^vpIlMD`-!hl>MUP<~b zmS1jcxka~q8SbnsAw>C@UhqPU-W|7k(TQZbuyDO}O40t)@tsQ5qGtO3(3hQWTx5)Q zh_&yz#vVGLYre`7X!*6bE^I%ovufpOI|^)#W}E3&*w@oha4|gDGzGH`?x&9<>*aQS zebZbUOdR%)l-Z)BIM`ul>?$aaE)69Qfe~F1H@b74vuyX~ZVU!5qfwd!>*ujVy6f&rkom)Yae= z4KCH|Jc2>z@e0XG-gjtHtzIxvn9$%CMIB!&x%{INhth1L10Fs9<{*9d?Z5(!x>T!7 zaNs3<|G7tV*^gIU4}M70BUW01KUUMXN%hv!^Y}$mnz6Z_DBoZABH;>oP7a|N0}$0a z>I3MiZfPB1g%HhWhiNiX>$GPM~m-(5<>DEZ|nF#;@^JRoa(VTRqeoL35 z-*UJScgJ_`^Ge?i{=@C2B_ESkZ@N8q7^zk%W<83B(QK~p&nGr{g8k5;2o|TiMkm5V zU9cV=8kuze+wS;_oPJ|$udj=Ct)ErXa3BNYXAXCXv7R5)f>x5n>Q&!kpm-RfPdcX& z))mLsO0vT^_5}Ao3Ej0ZfPG7NS+0Hjt6eQzfOteS+<+=RX?XjZvXIOOLIWaOz}hcn zNiR@LSMq~)0u{dB5r-QDgmu6W71MEbviRD0tqVxXXSJmFy;`&~bzo3B0f*Q@TqQJp zDCr$H9o4wMG|v4$?R|GtQ&|_UA_|TL1QkcBqJjlb5KvmghEf%j&_P8|Ne!cxyfWa3Q(5XP>?I z*=L{M*(H9@?ETz?o>K4jwoPAlw1v5Vz~XJk*U5VlR2uXfCnn~S;l2l)_03)hYuK;G zz06F2X;mjs1A~`8VM<&W?QXac5>p~;q$Z*@bfr@HwN-FUKH}opqW0W~p|fIp-G{(J 
zlydumwvxNG$D-sqE`BL6JCDiA#x|tYyj+|FH>%>?r-j#Cc+U@qlILOZNqZ>9$}-sYvlqGwg2hv z28%6jKfOr@s!o&)DiZfJS!7*1j^}K-aD1OJ-d8E5Ow|G=e|p?nefh}^8f_W}vr6-J zZ@;Lv0^}WtyX0}@jclF)n}U=*SXi2 zU34dN-Ral^3&`DsL{-~DcT4@CIL;TUybi|oe0P!g%={S2nD$$-+?ZAc2xqRw1^ks@ zZT974^+)yg1id}Rw;!mSUauQIUGrGyVDD`^eEs>db9#gHkz_cc`@u5K;Z<4oq+g!1 z!}aFh=QX+d-6Sh+9An`k%u-M92f*N)-m2?o@q8nR_r0&Gmn}OongZSx@Ct0lwdRWN zTkex#;BN4pcg{=~-Wy}>3!uKfO1|?vqCF)b>&x)-BHJhJt69^=uA9bd7G!BA>n@x^ z!Luax^-E5!7;fRbx>(BId;FjwX1kx~PWK`1Ief@JA@VA&8i;wWSi`yMGpE$*hw~h4Ucm2G-osdeMWx0GP0r@>?*$H~ZLVL=wf82=O*gEsGtIUW zpnp;v2gg54YzU5@qrwi+&sinqv6QHx-q$bSX;!eHg5fcdXO0e+*4p_VoSikAt$Dsi zvS9GV9q)$lQ%*|AXl)_23v9%K+kvh8_bURlZe7;%;-6^9t#H3yE6D22Z7kk76SfZ6@_S); zr;n~6PPneIHe)$p`OxONT$u7j5BBDcoSv0nZ5@4%=M&tMnc~DJbM1o?tn=0(yBt5& zo#{wHrt<_T@k%ZYs(LL0`Nb{)neEqAR2VWMR0|mxAuWY1*YN(Sy^7l{yE&2Qe+Hb( z-vzH#reRm&+3=f5-X{06caHO13j@Xb$+}xljFqLih3wj$BXw2>%x;i9TAW?%m_GFj z_6e8}9Xqnms`W*?sVL*XT0oZv0*pO(Cy(c7z>WE;#}OOkcs|$zAx+f1VEX$eSDC`l zdmWaiva~{q^zmUftKx1K6~T>5(27U3EXMj=_vz$WiCDTdJZ)Azw+XA$UUI5lp{hPUIZHgMV?lydtv|EZdWN53Kwmk6UlqVoU}R#C$9Dg*n{O74En4 zLcsH?WZa{vt-f`k6Ajl^2>TmaD-Ud->W|$OROq4`YS)(&KWs^bafUb}m43*O62MjO z0{3Og!Tf|qruL__>BZt+4hAuXs95VWy#&4NPHZ`RAyc(8KciR*#&&JIeJ%l3p;Ms- zfBJ3&-XK)oay9SS&8x;U7xNLcgGdo+{9`|2ZZ~Zfej^Obo=yLNF?{AwXszaKS%(Sl zyYj09h{rMNGm{hNIG&U*;HjNkpHSpC!<<_I;*b@m6%Mxz1kVQ7GE4k z{+qx5S=q~F0vx&qaiX$>7JOlzBDDFNs2f3CNvhe;yrXF-NHq0Zpmx!bcK&xuxnNZ$`}(Kn`@kOk~O9=L-68b2t2Rw#Y9cwSb> z9U{|Fz=Qw$^%cNel+;Fs_<`?KXxIz76fp>B_AH4k1b0xUrh3|na%&eY5()zrk`S=`;4w|(|6ur`ZIdm;F<;*b}FjiBPe9} zLlz!Pp>FZ%gj`4pEp}5hAu6a5VxQqd`ozZXAzP)!5Sf>g#A>ajx z!DHbmRKVcYgXY7bVb+hyFjV>PrHuNV6A*QZK!qBSn#)~Sz;G!Hez4y(Qyv(JSYV1| z^rt6|bb@1YMN%Lu3WBysUt*3917>{jVbjsD)T^^B)1&w=UCzaWEPaB|=q^YitA(9p zWEaQ`3}$01SqvtaKW7}q_w7D016AcrFe@V=pcN6s?*eqder?Iu5dcW0oS6Xu;_gUb zUaE!pr0QzSWxTVDtZYy@M(^PKm7Dz%ln6~PiOKJbaVw$R_?bEmYM1@7Ucby zOP_EA6UMI*u;dv5%~I;p8WO*269rtL5sWN_bPlY*4`0R-`Po}Xm61D6>=Q*F~R=y-5pAB@dq}d*4l}g zvb8pA06yn;r{ETjz3Hze@du4)&MyWp`MGKqPv4K@^*Ww6f)rT)DuMMR+ZcxtMoRfT 
zzt=@rV@B5_`LLvpAXUIMWkl;5&r)0KKc?z`rgCxg`@1SBBb^mhHUXXCNFg0BR-vB7kXM&;AQ_(w4;C75!7-kn+$=J+vKPqWyWnXaHD8Zgv_0TKX3E|D=u#;}~3ck*)nRNh4m{`#LtF!{kY9)X$4 zC^Amt-UOy+oDte`5rOj-iNAPK_+$KrjHmty84IQcgv7IB;l2y`R37J~KX(`tV2B0) zPKZCZHod(NJ>6CzGoS=Kr|_o1kR)a*r^%T&d%5VuZ|f+0PA*)M?95LA4xNKE#pqt4uxdGM+_WGa=@&&*gZ?+r8p1?|B6Awa`voCOed z{rSAuM1`=0=y}F&O@GW$6xZeWuV4@@e*y;7ka)G2Re@raXGMEyXh_L63DUKC;t=wU z?undjrj8_zapya8ftKJR+Sn~a*u5RGM);WLXBh|72~2AW|B(%UUM*uTXM?B*v_J8u zyR~MlV@wz<2j{7s6MXpdur8)-Ys&U4GEt4O5m;aodrSJ|6v3O>h&B%%C0-Q+gcZ?) znw|N(aW@cKg@!U9x6ssci(1H_1aDF~5D$hp>4jC!`IMJ=^9_0u^0&)T42WDQ>ggV7 zv4~Gn9?%L1L<%R( zSrm;h)%f+zm@-G_`TTmzZLjhJxQRf^pgW99W3ki#&-nzH6()4SJ+y<)qVmFWb{II4 z`}rMhxQH356y{NThSS1$H1NTOOSS-_R4$4i#}d33&{G(zv1a~sE~t8aENeM_Ag<`nKGO=4K2Wp4Qk53>v+

1s4)RPwK<#{pTPb$MA zrj6=JUh`#R{naVdBz!SYwuQm-pgqC&1T_Qm?j_R=K|IYe%Lv29k?EJ5rI%ifQ}yQO z-9t!PU)cOxcKGZLNJ(y6bfyygw>B`pfmr^tG^zI&vuM6omQ0YzqtZg}HmgI9hpp~~ zTuv_Gg2UB!y|%>zPz-nkvW#oimY#(uHUXZ>kS0H2TmvU|(C|$C74&ZcO$3wKYw(jc zzGgUOP3Ui5o2I{*SB2j$@Sp!qY@$6rmA+(&^vjb+4bKRP@&G!8>S;gxs%mP%V#mkb zR$fBaT2^mfG4-hN*l`=@Kf>-*R=l|9cH609-+;<~%CQyWhYt&>sI7W7er)r@0P*|n zurs?8C1vN0Cf?jl9E*z)z(6qSaGLUN0pfzPuoD8do((7!YqNc&Xn!&m=k{%zkEQSrHTLSjQhcZ-XYmF>{Hmp000 zw6w=d?6RCNy$2NZi@AmoEekLaMnIMmuFE5_u~Es>6ZSdlBOK~oqDK`H%$3~SvdPOV z^T?y6XZ9F4+1kE$RUbtu$ZO&azna=#bF7*Xb!{ExQX_OVG&S}0(Q3)5&8|iH`8nrX zo|2_>2Rc1VJaQ~m6FdFN-d??0c4pT;4UK~b=b4o9AV!}rRLd^$%Brp)hPk=<4mY zdOFy;dp7>RV)%>JC6UjUKPl}ZgD?KFJ>j$tv8V$4ZdX0ct|au7HUZsxN#;JPysYr_ zp7nk`6V*96%4z8)IpCwUJw1aH6BF9h1CU&^di3iQ_>_O%nS16i2fMn9#cvOv^S^L} zfG*TFy@Uv|$DZk|r!ms?YA+V=uCF(_d2=S&bjJ<#)9Z8E)80gx9mAdYWb4nLQgE_U5Op`%&wFaNaCH=M|$M6-U?&d(U&k1K72`zR;Fl;duKTz3#zbP9?;)W zh;-}IT}@$$dq4Pq=%A?eYS5SeEw-^lIe^{42AQr!G?)p$PsU1=36SH+ox>en*~yQlsA%EfdryqQXiF zloi_B4Rg@iN1!<2{6l)3B`OV}SRrcsBPIzavM8&+lIh>9Mz0fC(to2DeR6IE6e|{m z`map-|3bZT=>IvPMxJT~?M4vPvN)t56Uo2d$Im9$y?Zy7Wd!Ewgs(pcWeOFIbPOhUaPNKhCYE$j9t^B^+N117ph_FN88A&rf$SDPtWEv?XahD?4o|w7D|CZ z=>G+MX;$w^hd0cDTtc$#>lb_Wz0Rjan`O)^$d-Sdp*2W}l@1N64z z7fA8_pl6Rj+4uB?M?QIjh3x(K(MkPaMy)FxZu0j0MgL-_e5OmY@M!0?m4C1MuYZ}_ z5;-{Gwd01hJWO@tFRgU3leN6BKmXgm8cDu_+T8^?qQZg59%N-q>*ahzRLX<&w_(e^ z?NuzUApQt^LGO27uoUzccdpz!S-u_09EI*tpcvyJ$r3d`L|B6`4wYf7F@j+Fp(_G+ z5~@e-5`nWw{`h2DRTTqLX+{b`dMV$fH;|M&m@pODNdoKlvQ!$V4})n&anl`draF<8 ze*FAPBu$452(5`61~-X}o1;qG;G==0tBw%QR4rY98^I2<%posq3`9flL*&#U0)=rh z5y7_aW1XCwaTDr_>so? 
zHI=A|!E!02Iar%4bRU++tZAk^W?GVcW8g66nlms%H($9h{kry3 z8Vt#&)+3!Q_T&0a{&B=8+8&KxkT#0LOoA=UPlN}uy671J#7#2XkM&KLi@V+#5h7kP zr{7PWaCi`J(bro^Y*oVM>MitDlLh!OZRumQJh*)mc%!_9v?a(bXErQQYKOwqx5 zeu#{WJkTEaE|xkk)=F=r#+M>5PUe8GZs}D2h&2r#v&=VUm`lOpGN2T8asU*#BOPJJ zdzz?PqgJD|4aWCQ)xdCvvEGtj02;Fs-`8;5#3)SBLwp@J)jMFE_Fskt{(p7ufN%$lyxGuGUj zPB+o8ZSD0_w#%^k?z*cSG5+VCdcL%vdc~^GHaQPx{h=|EPsycleq|Ggz_8WSZKN~9 zt5-VU>1n>rAqf zSd3>)s=oVS;=(K3fF&n|UEow%R6oV8{c;vx!)IG}J??Tx4jrtas8G^l7_KN@lp|xX z&7MrHDjlB*O@Pw;3`1}dYxX5Rc$nI!^8{ZR=fyo?=7?bgSBSC)<&}cGbjidnZ`6PV zfF8nP2l{do!WaSEg*B3oo3xf6(G7e+*{px(L9D-zJC58YAH;0S(4YjOC40M#f&w|) zB#YBj6a)FnmDM&477&vapPLeDr6U5nsB!R~bdJ?y^x@(1`Z(YdD||f#s2|7D@<2Tf z@6qvQO$YPJap>8MZ1OyS6w}!|J(L)@g$Oj?#L3HgA>%)EoQZ7 zH>IJqsr;8`BjmRgR34O3%kxI-+YQ&(^pX|&%Jy!|*grWwqit1KHS5mE*d!A&lMrZR zeHn_zZY6Lt`*K%zV{CsgkmOsTr}H24kUEFEFlC>`z!Waniy>0IGqm;|*sr4#y1%Iu z3|K-&6cKl^LbO#M#7iWZ*C?)Dy&u;V@aj`g0a9(Lc|hQrsE9>YY2m6~RhHMNKhE z+UUHrRZ0W0a=p3nGTyKbu^jTeNKC$auX`ybZ2axV7=T-szEbxvASb_N7{Z&JTM3G> zpP~}LZHt*IWIgQ6{8ZRBth$gz7(e!Q;EJ_E3^5RuD_mX8BjDV8_%ER~V zq=Gbg*gn1XX9ZD5YMk_ETBvr?iZ6g(A0R`X6!5s@df*7nN)lrdE{98?XOop5lF+n? z{F;YJo5L@2-&N|#Be9cFqk&vSP43&bSZAa`&z;OWlDYjrJ-uK};DyMH$$3xM9VsYZ z1GVY|o&(utUidf3+o0=n_e;?iO{k?GZS|uwnoJXEORxvTUN1~o{-cv@kS+Tm)LO0H z`6I7s?J!kb50YZakKCs0@@HXY5Dr(>C(wEM_d{j$3{o^5l48UUkKT@*L@0Opqt7oO zqNoDJ`@dvUJ+Ccn0Y&j4{J;8npkF$wI_@O-wK^oE?#(PVtnc*ni$G{E6ofgTIYKq3~cSg*15meha9b27{mq7yA3_hCR-NH1z)_l zC80LEE!z8uz8ciT)qX#;Bdrpk<29_T+S=OKcWdWOca#(Cc5M{<_>`O*dmJ3mdKb;| zvPzP@gUthz90?7LgoFfVDlv@RBnt^s3BO)9$N563t*w32(BQDEPtLIM5HMO-upSX5!%{X zH?AI5O+mkW0YZ8HdF@13^@F-8scC5oL(C= zA<<6-@wh{IHhYqc$7$0;?~bc=k+pO^EI_PqZBGFnWR^A7dUs;o&=cqhWUd4;eYbbd zx;%(IN|~~!xe16AX6b31_82o1JM>@G{>l$O^gyWbRu4#I#u~}qo|oKX+<7AOgQ7x| zrNwLenfA! 
z+DhspD9sbz{v#&+X3RxggV6oRO2M7YzgpPJZQ)T+q7w&&`Y#7*!NLFk3l$V9u)=Gi W%r8Mxf7cT5=j1W-qXmaue)}I{>dlJ) literal 0 HcmV?d00001 diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py index 59bfee77f60..1d3872a1783 100644 --- a/profiler/advisor/interface/interface.py +++ b/profiler/advisor/interface/interface.py @@ -13,23 +13,31 @@ from profiler.advisor.analyzer.cluster.slow_rank_analyser import SlowRankAnalyze from profiler.advisor.analyzer.cluster.slow_link_analyser import SlowLinkAnalyzer from profiler.advisor.analyzer.overall.overall_summary_analyzer import OverallSummaryAnalyzer from profiler.advisor.analyzer.schedule.dispatch.timeline_op_dispatch_analyzer import OpDispatchAnalyzer +from profiler.advisor.analyzer.schedule.syncbn.syncbn_analyzer import SyncBNAnalyzer +from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_analyzer import SynchronizeStreamAnalyzer +from profiler.advisor.analyzer.dataloader.dataloader_analyzer import DataloaderAnalyzer +from profiler.advisor.analyzer.computation.ai_core_freq.ai_core_freq_analyzer import AICoreFreqAnalyzer + class Interface: supported_analyzer = { "schedule": OrderedDict({ - SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer, - SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer + SupportedScopes.SYNCBN: SyncBNAnalyzer, + SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer, + SupportedScopes.SYNCHRONIZE_STREAM: SynchronizeStreamAnalyzer, + SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer }), "computation": OrderedDict({ SupportedScopes.DYNAMIC_SHAPE_ANALYSIS: DynamicShapeAnalyzer, SupportedScopes.AICPU_ANALYSIS: AicpuAnalyzer, SupportedScopes.OPERATOR_NO_BOUND_ANALYSIS: OperatorBoundAnalyzer, SupportedScopes.BLOCK_DIM_ANALYSIS: BlockDimAnalyzer, - SupportedScopes.GRAPH: FusionOPAnalyzer + SupportedScopes.GRAPH: FusionOPAnalyzer, + SupportedScopes.FREQ_ANALYSIS: AICoreFreqAnalyzer }), "communication": OrderedDict(), 
"overall": OrderedDict({SupportedScopes.OVER_ALL: OverallSummaryAnalyzer}), - "dataloader": OrderedDict(), + "dataloader": OrderedDict({SupportedScopes.DATALOADER: DataloaderAnalyzer}), "cluster": OrderedDict({ SupportedScopes.SLOW_RANK: SlowRankAnalyzer, SupportedScopes.SLOW_LINK: SlowLinkAnalyzer @@ -66,7 +74,7 @@ class Interface: if render_html and result.data: if hasattr(analyzer, "html_render"): analyzer.html_render.render_html() - analyzer.html_render.save_to_file(f'att_advisor_{Timer().strftime}.html') + analyzer.html_render.save_to_file(f'mstt_advisor_{Timer().strftime}.html') return result if not output_dict else dict(result.data) diff --git a/profiler/advisor/result/item.py b/profiler/advisor/result/item.py index fa0ffb5b1c7..02db7fdd004 100644 --- a/profiler/advisor/result/item.py +++ b/profiler/advisor/result/item.py @@ -15,7 +15,7 @@ class OptimizeItem: @property def headers(self): - return ["problem", "description", "suggestion"] + return ["category", "description", "suggestion"] class StatisticsItem: diff --git a/profiler/advisor/result/result.py b/profiler/advisor/result/result.py index c7d7da8663c..0d0602ee56c 100644 --- a/profiler/advisor/result/result.py +++ b/profiler/advisor/result/result.py @@ -93,6 +93,9 @@ class SheetRecoder: if data not in self._sheet_data[sheet_name]["data"]: self._sheet_data[sheet_name]["data"].append(data) + def clear(self): + self._sheet_data.clear() + @singleton class OptimizeResult: @@ -110,12 +113,12 @@ class OptimizeResult: def add_tune_op_list(self, tune_op_list) -> None: """ add tune op name to tune op list - :param tune_op_list: tune op name list to be added + :param tune_op_list: list of operators to be optimized :return: None """ - for op_name in tune_op_list: - if op_name not in self._tune_op_list: - self._tune_op_list.append(op_name) + for operator_name in tune_op_list: + if operator_name not in self._tune_op_list: + self._tune_op_list.append(operator_name) def add(self, overview_item): sheet_name = 
"problems" @@ -148,6 +151,9 @@ class OptimizeResult: logger.info("Save problems details file to %s", Config().analysis_result_file) self._save_op_file_list() + def clear(self) -> None: + self.data.clear() + def _save_op_file_list(self) -> None: if not self._tune_op_list: return @@ -173,9 +179,9 @@ class TerminalResult: def __init__(self): self.width, _ = self.get_terminal_size() if self.width is None: - self.table = PrettyTable(["No.", "Problem", "Description", "Suggestion"]) + self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"]) else: - self.table = PrettyTable(["No.", "Problem", "Description", "Suggestion"], + self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"], max_table_width=max(self.width - 20, 180)) self.table.hrules = ALL self.result_list = [] diff --git a/profiler/advisor/rules/dataloader.yaml b/profiler/advisor/rules/dataloader.yaml new file mode 100644 index 00000000000..2bb7a4c0e70 --- /dev/null +++ b/profiler/advisor/rules/dataloader.yaml @@ -0,0 +1,9 @@ +# unit is milliseconds +dataloader_duration_threshold: 10 +problem: "Found slow dataloader, cost {dataloader_duration} milliseconds for one step while profiling, normally less than {dataloader_duration_threshold} milliseconds." +solutions: + - "Please check the disk I/O of your data directory. If you are training model in ModelArts, please move data to '/cache' or mount a more efficient cloud disk for better I/O." + - "Please check if there are any other multiprocess operations in runtime that may have affected the dataloader, such as training process core binding command 'taskset ...' used for launching the training job." + - "Please check the format of your data, avoid file format like tar, tar.gz, zip." + - "Please set 'pin_memory=True' for your dataloader." + - "Try to adjust dataloader parameter 'num_workers'." 
\ No newline at end of file diff --git a/profiler/advisor/rules/sync_batchnorm.yaml b/profiler/advisor/rules/sync_batchnorm.yaml new file mode 100644 index 00000000000..d65bcb0d4a1 --- /dev/null +++ b/profiler/advisor/rules/sync_batchnorm.yaml @@ -0,0 +1,41 @@ +problem: "Found {syncbn_num} SyncBatchNorm, which can lead to slow python task dispatch and frequent communication between devices and finally reducing training efficiency." +max_syncbn_num: 20 +solutions: + - enable batchnorm: + desc: "disable SyncBatchNorm by remove the code like 'torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)' if possible." + - enable efficient SyncBatchNorm: + desc: "replace the 'forward' method of python script 'torch_npu/utils/syncbatchnorm.py' in your runtime environment." + efficient_code: | + @staticmethod + def forward(self, input_tensor, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size): + input_tensor = input_tensor.contiguous() + input_shape = input_tensor.shape + input_tensor_ = input_tensor.reshape(input_shape[0], input_shape[1], 1, -1) + sum_val, sum_square_val = torch.batch_norm_reduce(input_tensor_, eps) + + count = torch.full((1,), + input_tensor.numel() // input_tensor.size(1), + dtype=sum_val.dtype, + device=sum_val.device) + + num_channels = input_tensor.shape[1] + combined = torch.cat([sum_val, sum_square_val, count], dim=0) + combined_list = torch.empty((world_size,) + combined.shape, dtype=combined.dtype, device=combined.device) + dist.all_gather_togather(combined_list, combined, process_group, async_op=False) + sum_all, square_sum_all, count_all = torch.split(combined_list, num_channels, dim=1) + size = count_all.view(-1).sum() + if size == 1: + raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size)) + + mean, invstd = torch.batch_norm_gather_stats_update(input_tensor, + sum_all, + square_sum_all, + running_mean, + running_var, + momentum, + eps, + count_all.view(-1)) + 
self.save_for_backward(input_tensor, weight, mean, invstd, count_all.to(torch.int32)) + self.process_group = process_group + out = torch.batch_norm_elemt(input_tensor, weight, bias, mean, invstd, eps) + return out \ No newline at end of file diff --git a/profiler/advisor/rules/synchronize.yaml b/profiler/advisor/rules/synchronize.yaml new file mode 100644 index 00000000000..ed105b345c6 --- /dev/null +++ b/profiler/advisor/rules/synchronize.yaml @@ -0,0 +1,8 @@ +problem: "SynchronizeStream will reduce training efficiency. Found {synchronize_num} SynchronizeStream, {slow_synchronize_num} slow SynchronizeStream cost {total_synchronize_stream_time} us." +max_synchronize_num: 20 +slow_synchronize_threshold: 10 #ms +solutions: + - disable ascend launch blocking: + desc: "please check your env 'ASCEND_LAUNCH_BLOCKING', if ASCEND_LAUNCH_BLOCKING=1, please execute 'unset ASCEND_LAUNCH_BLOCKING' and then start your training job." + - modify code to avoid synchronize stream: + desc: "please try to modify your training code to avoid synchronize stream between cpu and npu." 
\ No newline at end of file diff --git a/profiler/advisor/utils/utils.py b/profiler/advisor/utils/utils.py index 84419b67087..83f304c2d3c 100644 --- a/profiler/advisor/utils/utils.py +++ b/profiler/advisor/utils/utils.py @@ -1,5 +1,6 @@ import inspect import json + import logging import multiprocessing as mp import os @@ -11,7 +12,7 @@ import traceback import types from functools import wraps from typing import Any, Set - +import ijson import click import requests from requests.adapters import HTTPAdapter @@ -43,7 +44,7 @@ class ContextObject(object): def debug_option(f): - return click.option('--debug', '-D', + return click.option('--debug', is_flag=True, expose_value=False, is_eager=True, @@ -413,7 +414,17 @@ def format_excel_title(title: str) -> str: title = title.replace("(ns)", '') title = title.replace("(%)", '') title = title.replace(" ", "_") - return title + + # 将kernel_details中的列名转为与op_summary_x.csv中一致 + kernel_details_col_name_map = { + "name": "op_name", + "type": "op_type", + "accelerator_core": "task_type", + "start_time": "task_start_time", + "duration": "task_duration", + "wait_time": "wait_time" + } + return kernel_details_col_name_map.get(title, title) def format_float(num: float) -> float: @@ -550,3 +561,50 @@ def get_file_path_by_walk(root, filename): file_path = os.path.join(root, name) return file_path return file_path + + +def check_path_valid(path): + if os.path.islink(os.path.abspath(path)): + logger.error("fThe path is detected as a soft connection. path:%ss", path) + return False + elif not os.access(path, os.R_OK): + logger.error(f"The file is not readable. path:%ss", path) + return False + elif os.path.getsize(path) > const.MAX_FILE_SIZE: + logger.error(f"The file size exceeds the limit. 
path:%ss, MAX_FILE_SIZE:%ss B",path, const.MAX_FILE_SIZE) + return False + return True + + +def parse_json_with_generator(timeline_data_path, func): + result = [] + if not check_path_valid(timeline_data_path): + return result + try: + with open(timeline_data_path, "r") as f: + if os.getenv(const.DISABLE_STREAMING_READER) == "1": + logger.debug("Disable streaming reader.") + file_parser = json.loads(f.read()) + else: + logger.debug("Enable streaming reader.") + file_parser = ijson.items(f, "item") + + for i, event in tqdm(enumerate(file_parser), + leave=False, ncols=100, desc="Building dataset for timeline analysis"): + func_res = func(index=i, event=event) + if func_res is not None: + result.append(func_res) + + except Exception: + logger.warning("Error %s while parsing file %s, continue to timeline analysis", traceback.format_exc(), + timeline_data_path) + return result + + +def convert_to_float(num): + try: + return float(num) + except (ValueError, FloatingPointError): + logger.error(f"Can not convert %ss to float", num) + pass + return 0 diff --git a/profiler/cli/__init__.py b/profiler/cli/__init__.py index eab13571c58..e768e4cb86c 100644 --- a/profiler/cli/__init__.py +++ b/profiler/cli/__init__.py @@ -1,4 +1,4 @@ from profiler.advisor.config.config import Config from profiler.advisor.utils.utils import Timer -Config().set_log_path(f"att_advisor_{Timer().strftime}.xlsx") +Config().set_log_path(f"mstt_advisor_{Timer().strftime}.xlsx") diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py index 2e173dc8708..f400a265b7b 100644 --- a/profiler/cli/analyze_cli.py +++ b/profiler/cli/analyze_cli.py @@ -83,9 +83,6 @@ def analyze_cli(**kwargs): help="enter the profiling type, selectable range ascend_pytorch_profiler, mslite ,msprof") @debug_option def analyze_all(**kwargs) -> None: - # 当前compare_tools必须输入两个profiling路径,att-advisor有等价功能支持输入一个Profiling路径,后续替换成对应实现 - if not kwargs.get("benchmark_profiling_path"): - kwargs["benchmark_profiling_path"] = 
kwargs.get("profiling_path") try: _analyze(Interface.all_dimension, **kwargs) except RuntimeError as e: diff --git a/profiler/cli/compare_cli.py b/profiler/cli/compare_cli.py index e794578da8c..f9add948ea9 100644 --- a/profiler/cli/compare_cli.py +++ b/profiler/cli/compare_cli.py @@ -32,6 +32,8 @@ from profiler.compare_tools.compare_backend.comparison_generator import Comparis @click.option('--enable_operator_compare', is_flag=True) @click.option('--enable_memory_compare', is_flag=True) @click.option('--enable_communication_compare', is_flag=True) +@click.option('--enable_api_compare', is_flag=True) +@click.option('--enable_kernel_compare', is_flag=True) @click.option('--disable_details', is_flag=True) @click.option('--output_path', '-o', 'output_path', type=click.Path()) @click.option('--max_kernel_num', 'max_kernel_num', type=int, help="The number of kernels per torch op is limited.") diff --git a/profiler/cluster_analyse/README.md b/profiler/cluster_analyse/README.md index deaebb6cde5..fdd43ca965f 100644 --- a/profiler/cluster_analyse/README.md +++ b/profiler/cluster_analyse/README.md @@ -86,7 +86,7 @@ experimental_config = torch_npu.profiler._ExperimentalConfig( ### 交付件 -集群分析工具的交付件通过Ascend Insight工具展示,详见《[MindStudio Ascend Insight用户指南](https://www.hiascend.com/document/detail/zh/mindstudio/70RC1/GUI-baseddevelopmenttool/msascendinsightug/AscendInsight_0002.html)》。 +集群分析工具的交付件通过MindStudio Insight工具展示,详见《[MindStudio Insight用户指南](https://www.hiascend.com/document/detail/zh/mindstudio/70RC2/GUI-baseddevelopmenttool/msascendinsightug/AscendInsight_0002.html)》。 #### cluster_step_trace_time.csv @@ -156,25 +156,25 @@ L列:Preparing,指迭代开始到首个计算或通信算子运行的时间 #### cluster_analysis.db -解析analysis.db或ascend_pytorch_profiler_{rank_id}.db生成的交付件,根据数据解析模式不同而解析不同的数据,可以使用Ascend Insight工具展示。 +解析analysis.db或ascend_pytorch_profiler_{rank_id}.db生成的交付件,根据数据解析模式不同而解析不同的数据,可以使用MindStudio Insight工具展示。 #### stats.ipynb - 数据解析模式为cann_api_sum时生成,保存在cluster_analysis_output/CannApiSum目录下。 - 
可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群API耗时信息。 + 可使用jupyter notebook工具或MindStudio Insight工具打开,主要展示集群API耗时信息。 - 数据解析模式为compute_op_sum时生成,保存在cluster_analysis_output/ComputeOpSum目录下。 - 可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群计算算子耗时分析(将集群所有计算算子进行汇总并以图表展示),集群Rank计算算子耗时分析(将每个Rank的计算算子进行各自汇总)。 + 可使用jupyter notebook工具或MindStudio Insight工具打开,主要展示集群计算算子耗时分析(将集群所有计算算子进行汇总并以图表展示),集群Rank计算算子耗时分析(将每个Rank的计算算子进行各自汇总)。 - 数据解析模式为hccl_sum时生成,保存在cluster_analysis_output/HcclSum目录下。 - 可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群通信算子耗时分析(将集群所有通信算子进行汇总并以图表展示),集群Rank通信算子耗时分析(将每个Rank的通信算子进行各自汇总)、Top通信算子信息展示。 + 可使用jupyter notebook工具或MindStudio Insight工具打开,主要展示集群通信算子耗时分析(将集群所有通信算子进行汇总并以图表展示),集群Rank通信算子耗时分析(将每个Rank的通信算子进行各自汇总)、Top通信算子信息展示。 - 数据解析模式为mstx_sum时生成,保存在cluster_analysis_output/MstxSum目录下。 - 可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群场景mstx打点信息,分为框架侧、CANN侧和Device侧三部分的打点信息。 + 可使用jupyter notebook工具或MindStudio Insight工具打开,主要展示集群场景mstx打点信息,分为框架侧、CANN侧和Device侧三部分的打点信息。 diff --git a/profiler/cluster_analyse/common_func/file_manager.py b/profiler/cluster_analyse/common_func/file_manager.py index e7e2d5adca3..380192f87be 100644 --- a/profiler/cluster_analyse/common_func/file_manager.py +++ b/profiler/cluster_analyse/common_func/file_manager.py @@ -17,6 +17,8 @@ import os import csv import json +import yaml + from common_func.constant import Constant from common_func.path_manager import PathManager @@ -60,6 +62,23 @@ class FileManager: raise RuntimeError(f"Failed to read the file: {base_name}") from e return result_data + @classmethod + def read_yaml_file(cls, file_path: str) -> dict: + PathManager.check_path_readable(file_path) + base_name = os.path.basename(file_path) + file_size = os.path.getsize(file_path) + if file_size <= 0: + return {} + if file_size > Constant.MAX_JSON_SIZE: + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") + + try: + with open(file_path, "r") as yaml_file: + result_data = yaml.safe_load(yaml_file) + except 
Exception as e: + raise RuntimeError(f"Failed to read the file: {base_name}") from e + return result_data + @classmethod def create_csv_file(cls, profiler_path: str, data: list, file_name: str, headers: list = None) -> None: if not data: diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md index d81ce05f447..b40f19e92fa 100644 --- a/profiler/compare_tools/README.md +++ b/profiler/compare_tools/README.md @@ -145,6 +145,8 @@ python performance_compare.py [基准性能数据文件所在路径] [比对性 | --enable_operator_compare | 开启算子性能比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | | --enable_communication_compare | 开启通信性能比对。 | 否 | | --enable_memory_compare | 开启算子内存比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | +| --enable_kernel_compare | 开启kernel性能比对。仅针对NPU与NPU比对的场景。需要使用性能数据中的kernel_details.csv文件。 | 否 | +| --enable_api_compare | 开启API性能比对。需要使用性能数据中的trace_view.csv文件。 | 否 | | --disable_details | 隐藏明细比对,只进行统计级比对。 | 否 | 说明:以上开关均不设置的情况下,**工具默认开启所有的性能比对**,当用户设置了以上开关,则按照用户设置的开关进行性能比对,示例如下: @@ -174,9 +176,13 @@ python performance_compare.py [基准性能数据文件] [比对性能数据文 MindSpore场景仅支持**总体性能**和**通信性能**的对比。 +比对结果分为打屏和performance_comparison_result_{timestamp}.csv两种形式输出,其中打屏输出为概要信息,csv文件保存详细结果。 + ### 总体性能 -总体性能比对结果以打屏的形式呈现。 +#### 打屏结果 + +总体性能比对结果以打屏的形式呈现时,字段如下: | 字段 | 说明 | | --------------------------------------- | ------------------------------------------------------------ | @@ -196,6 +202,54 @@ MindSpore场景仅支持**总体性能**和**通信性能**的对比。 | E2E Time(Not minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | | Other Time | AI CPU、DSA、TensorMove等其他算子耗时。 | +#### csv文件结果 + +总体性能比对结果在performance_comparison_result_*.xlsx中OverallMetrics的sheet页呈现时,示例如下: + +![OverallMetrics](./img/OverallMetrics.png) + +表头字段说明: + +| 字段 | 说明 | +| -------------- | --------------------------- | +| Index | 指标。 | +| Duration(ms) | 执行耗时,单位ms。 | +| Duration Ratio | 执行耗时占E2E总耗时的比例。 | +| Number | 计算算子的数量。 | + +Index列字段说明: + +| 字段 | | | 说明 | +| ---------------------------- | 
------------------ | ----------------------------------- | ------------------------------------------------------------ | +| Computing Time | | | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | +| | Flash Attention | | Flash Attention算子。 | +| | | Flash Attention (Forward) (Cube) | Flash Attention前向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Flash Attention (Forward) (Vector) | Flash Attention前向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | | Flash Attention (Backward) (Cube) | Flash Attention反向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Flash Attention (Backward) (Vector) | Flash Attention反向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | Conv | | Conv算子。 | +| | | Conv (Forward) (Cube) | Conv前向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Conv (Forward) (Vector) | Conv前向Vector算子。Conv前向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | | Conv (Backward) (Cube) | Conv反向算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Conv (Backward) (Vector) | Conv反向算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | Matmul | | Matmul算子。 | +| | | Matmul (Cube) | Matmul算子下发的所有Cube类Kernel的总耗时,一般为执行该算子核心计算的算子。 | +| | | Matmul (Vector) | Matmul算子下发的所有Vector类Kernel的总耗时,一般为插入的转换类算子,如TransData。 | +| | Paged Attention | | Paged Attention算子。 | +| | Vector | | Vector算子。 | +| | | Vector (Trans) | 转换类Vector算子,主要包含Cast、TransPose、TransData算子。(仅针对NPU数据) | +| | | Vector ( No Trans) | 非转换类Vector算子。 | +| | Cube | | 未识别出Flash Attention、Conv和Matmul的Cube算子。 | +| | SDMA (Tensor Move) | | 拷贝类任务。 | +| | Other | | AI CPU、DSA等其他算子。 | +| Uncovered Communication Time | | | 通信未掩盖耗时,包含卡间等待时间。 | +| | Wait | | 卡间同步等待耗时。(仅针对NPU数据) | +| | Transmit | | 通信传输耗时。 | +| Free Time | | | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | +| | SDMA | | NPU为除Tensor Move外的拷贝类任务,GPU为所有拷贝类任务。 | +| | Free | | 排除SDMA的空闲耗时。 | +| E2E Time | | | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | + 可以采取最简性能数据采集的方式来减少E2E耗时的性能膨胀,示例代码如下: ```python @@ 
-213,7 +267,7 @@ activities配置仅采集NPU数据,不配置experimental_config参数以及其 - 当Computing Time耗时增大,分析**算子性能**。 - 当Uncovered Communication Time耗时增大,分析**通信性能**,若通信性能分析没有劣化的通信算子,代表通信与计算的并行度较差,继续进行NPU的集群性能分析。 -- 当Mem Usage增大,分析**算子内存**,若没有明显占用较大的算子,则代表算子内存申请量没有差异,问题在于内存的释放(持有时间过久),可以使用tensorboard或ascend insight继续进行NPU内存的分析。 +- 当Mem Usage增大,分析**算子内存**,若没有明显占用较大的算子,则代表算子内存申请量没有差异,问题在于内存的释放(持有时间过久),可以使用TensorBoard或MindStudio insight继续进行NPU内存的分析。 ### 算子性能 @@ -300,3 +354,29 @@ MindSpore场景暂不支持。 步骤1:查看MemoryCompareStatistic页,找出内存占用差距TOP的算子。 步骤2:查看MemoryCompare页,搜索内存占用差距TOP的算子,查看具体占用的子算子。 + +### kernel性能 + +仅针对NPU与NPU比对的场景。 + +kernel比对结果在performance_comparison_result_*.xlsx中KernelCompare页呈现。 + +按照Kernel(Kernel类型)和Input Shapes(输入Shape)分组统计,统计信息包括: + +- Total Duration(us):总耗时,单位us。 +- Avg Duration(us):平均耗时,单位us。 +- Max Duration(us):最大耗时,单位us。 +- Min Duration(us):最小耗时,单位us。 +- Calls:调用次数。 + +### API性能 + +API比对结果在performance_comparison_result_*.xlsx中ApiCompare页呈现。 + +按照api name(API名称)组统计,统计信息包括: + +- Total Duration(ms):总耗时,单位ms。 +- Self Time(ms):Self耗时(排除掉子event),单位ms。 +- Avg Duration(ms):平均耗时,单位ms。 +- Calls:调用次数。 + diff --git a/profiler/compare_tools/compare_backend/comparator/api_compare_comparator.py b/profiler/compare_tools/compare_backend/comparator/api_compare_comparator.py new file mode 100644 index 00000000000..bc5810068b0 --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/api_compare_comparator.py @@ -0,0 +1,32 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.constant import Constant +from compare_backend.utils.common_func import update_order_id + + +class ApiCompareComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + @classmethod + def _aggregated_api_by_name(cls, ops: list): + ops_dict = {} + for op in ops: + ops_dict.setdefault(op.name, []).append(op) + return ops_dict + + def _compare(self): + if not self._origin_data: + return + base_ops = 
self._origin_data.get(Constant.BASE_DATA, {}) + comparison_ops = self._origin_data.get(Constant.COMPARISON_DATA, {}) + if not base_ops or not comparison_ops: + return + base_aggregated_ops = self._aggregated_api_by_name(base_ops) + comparison_aggregated_ops = self._aggregated_api_by_name(comparison_ops) + for op_name, base_data in base_aggregated_ops.items(): + comparsion_data = comparison_aggregated_ops.pop(op_name, []) + self._rows.append(self._bean(op_name, base_data, comparsion_data).row) + if comparison_aggregated_ops: + for op_name, comparison_data in comparison_aggregated_ops.items(): + self._rows.append(self._bean(op_name, [], comparison_data).row) + update_order_id(self._rows) diff --git a/profiler/compare_tools/compare_backend/comparator/base_comparator.py b/profiler/compare_tools/compare_backend/comparator/base_comparator.py index 330fb871ee1..8012dfae944 100644 --- a/profiler/compare_tools/compare_backend/comparator/base_comparator.py +++ b/profiler/compare_tools/compare_backend/comparator/base_comparator.py @@ -21,4 +21,4 @@ class BaseComparator(ABC): @abstractmethod def _compare(self): - raise NotImplementedError("Function _compare need to be implemented.") + raise NotImplementedError("Function _compare need to be implemented.") \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/comparator/kernel_compare_comparator.py b/profiler/compare_tools/compare_backend/comparator/kernel_compare_comparator.py new file mode 100644 index 00000000000..13c0f776af6 --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/kernel_compare_comparator.py @@ -0,0 +1,35 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.constant import Constant +from compare_backend.utils.common_func import update_order_id + + +class KernelCompareComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + @classmethod + def 
_aggregated_kernel_by_type_and_shape(cls, kernels: dict): + result_dict = {} + for type_shape, shape_values in kernels.items(): + for shape, kernel_data in shape_values.items(): + kernel = [single[1] for single in kernel_data] + result_list = [type_shape, shape, sum(kernel), len(kernel), max(kernel), min(kernel)] + result_dict.setdefault(f"{type_shape}{shape}", []).extend(result_list) + return result_dict + + def _compare(self): + if not self._origin_data: + return + base_kernels = self._origin_data.get(Constant.BASE_DATA, {}) + comparison_kernels = self._origin_data.get(Constant.COMPARISON_DATA, {}) + if not base_kernels or not comparison_kernels: + return + base_aggregated_kernels = self._aggregated_kernel_by_type_and_shape(base_kernels) + comparison_aggregated_kernels = self._aggregated_kernel_by_type_and_shape(comparison_kernels) + for type_shape, base_data in base_aggregated_kernels.items(): + comparsion_data = comparison_aggregated_kernels.pop(type_shape, []) + self._rows.append(self._bean(base_data, comparsion_data).row) + if comparison_aggregated_kernels: + for _, comparison_data in comparison_aggregated_kernels.items(): + self._rows.append(self._bean([], comparison_data).row) + update_order_id(self._rows) \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/comparator/overall_metrics_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_metrics_comparator.py new file mode 100644 index 00000000000..d438dc41d56 --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/overall_metrics_comparator.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from math import isclose + +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + +class OverallMetricsComparator(BaseComparator): + + def __init__(self, origin_data: dict, bean: any): + super().__init__(origin_data, bean) + self._row_style = [] + + @property + def base_info(self): + return self._origin_data.get(Constant.BASE_DATA) + + @property + def comp_info(self): + return self._origin_data.get(Constant.COMPARISON_DATA) + + def generate_data(self) -> dict: + self._compare() + return {self._sheet_name: { + "headers": self._headers, + "rows": self._rows, + "overhead": self._overhead, + "row_style": self._row_style + }} + + def _compare(self): + if isclose(self.base_info.e2e_time_ms, 0) or isclose(self.comp_info.e2e_time_ms, 0): + return + self._rows.extend(self._bean(self.base_info, self.comp_info).rows) + for row in self._rows: + self._row_style.append(ExcelConfig.ROW_STYLE_MAP.get(row[0], {})) # index 0 for metric index name diff --git a/profiler/compare_tools/compare_backend/compare_bean/api_compare_bean.py b/profiler/compare_tools/compare_backend/compare_bean/api_compare_bean.py new file mode 100644 index 00000000000..55e08a86be8 --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/api_compare_bean.py @@ -0,0 +1,47 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + 
+class ApiInfo: + def __init__(self, op_name: str, data_list: list): + self._data_list = data_list + self.name = op_name + self.total_dur = 0.0 + self.self_time = 0.0 + self.avg_dur = 0.0 + self.number = len(data_list) + self._get_info() + + def _get_info(self): + for data in self._data_list: + self.total_dur += data.api_dur + self.self_time += data.api_self_time + self.total_dur /= 1000.0 + self.self_time /= 1000.0 + self.avg_dur = self.total_dur / self.number if self.number else 0.0 + + +class ApiCompareBean: + TABLE_NAME = Constant.API_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, op_name: str, base_api: list, comparison_api: list): + self._name = op_name + self._base_api = ApiInfo(op_name, base_api) + self._comparison_api = ApiInfo(op_name, comparison_api) + + @property + def row(self): + row = [None, self._name, + self._base_api.total_dur, self._base_api.self_time, self._base_api.avg_dur, self._base_api.number, + self._comparison_api.total_dur, self._comparison_api.self_time, + self._comparison_api.avg_dur, self._comparison_api.number] + diff_fields = [calculate_diff_ratio(self._base_api.total_dur, self._comparison_api.total_dur)[1], + calculate_diff_ratio(self._base_api.self_time, self._comparison_api.self_time)[1], + calculate_diff_ratio(self._base_api.avg_dur, self._comparison_api.avg_dur)[1], + calculate_diff_ratio(self._base_api.number, self._comparison_api.number)[1]] + row.extend(diff_fields) + return row + diff --git a/profiler/compare_tools/compare_backend/compare_bean/kernel_compare_bean.py b/profiler/compare_tools/compare_backend/compare_bean/kernel_compare_bean.py new file mode 100644 index 00000000000..df96addc4fe --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/kernel_compare_bean.py @@ -0,0 +1,75 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from 
compare_backend.utils.excel_config import ExcelConfig + + +class KernelCompareInfo: + def __init__(self, data_list: list): + self._kernel_type = None + self._input_shapes = None + self._total_dur = None + self._number = None + self._max_dur = None + self._min_dur = None + if not data_list: + return + self._kernel_type = data_list[0] + self._input_shapes = data_list[1] + self._total_dur = data_list[2] + self._number = data_list[3] + self._max_dur = data_list[4] + self._min_dur = data_list[5] + + @property + def kernel_type(self): + return self._kernel_type + + @property + def input_shapes(self): + return self._input_shapes + + @property + def total_dur(self): + return self._total_dur if self._total_dur else 0.0 + + @property + def number(self): + return self._number + + @property + def max_dur(self): + return self._max_dur + + @property + def min_dur(self): + return self._min_dur + + @property + def avg_dur(self): + return self._total_dur / self._number if self._total_dur and self._number else 0.0 + + +class KernelCompareBean: + TABLE_NAME = Constant.KERNEL_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, base_kernel: list, comparison_kernel: list): + self._base_kernel = KernelCompareInfo(base_kernel) + self._comparison_kernel = KernelCompareInfo(comparison_kernel) + self._kernel_type = self._base_kernel.kernel_type \ + if self._base_kernel.kernel_type else self._comparison_kernel.kernel_type + self._input_shapes = self._base_kernel.input_shapes \ + if self._base_kernel.input_shapes else self._comparison_kernel.input_shapes + + @property + def row(self): + row = [None, self._kernel_type, self._input_shapes, + self._base_kernel.total_dur, self._base_kernel.avg_dur, + self._base_kernel.max_dur, self._base_kernel.min_dur, self._base_kernel.number, + self._comparison_kernel.total_dur, self._comparison_kernel.avg_dur, + self._comparison_kernel.max_dur, self._comparison_kernel.min_dur, 
self._comparison_kernel.number] + diff_fields = [calculate_diff_ratio(self._base_kernel.total_dur, self._comparison_kernel.total_dur)[1], + calculate_diff_ratio(self._base_kernel.avg_dur, self._comparison_kernel.avg_dur)[1]] + row.extend(diff_fields) + return row \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py index 122009b9045..c15396e9c59 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py @@ -1,8 +1,9 @@ import math +from decimal import Decimal import pandas as pd -from compare_backend.utils.common_func import convert_to_float +from compare_backend.utils.common_func import convert_to_float, convert_to_decimal from compare_backend.utils.constant import Constant @@ -11,9 +12,12 @@ class KernelDetailsBean: self._data = data self._op_type = "" self._name = "" + self._input_shapes = "" self._aiv_vec_time = 0.0 + self._aicore_time = 0.0 self._mac_time = 0.0 self._duration = 0.0 + self._start_time = Decimal("0") self.init() @property @@ -24,12 +28,22 @@ class KernelDetailsBean: def name(self) -> str: return self._name + @property + def input_shapes(self) -> str: + return self._input_shapes + @property def aiv_vec_time(self) -> float: if self._aiv_vec_time == "" or self._aiv_vec_time == "N/A": return float("nan") return convert_to_float(self._aiv_vec_time) + @property + def aicore_time(self) -> float: + if self._aicore_time == "" or self._aicore_time == "N/A": + return float("nan") + return convert_to_float(self._aicore_time) + @property def mac_time(self) -> float: if self._mac_time == "" or self._mac_time == "N/A": @@ -40,6 +54,18 @@ class KernelDetailsBean: def duration(self) -> float: return convert_to_float(self._duration) + @property + def 
dur(self) -> float: + return convert_to_float(self._duration) + + @property + def start_time(self) -> Decimal: + return convert_to_decimal(self._start_time) + + @property + def end_time(self) -> Decimal: + return self.start_time + convert_to_decimal(self._duration) + def is_hide_op_pmu(self): if "mac_time(us)" in self._data.keys() or "aiv_vec_time(us)" in self._data.keys(): return False @@ -66,7 +92,7 @@ class KernelDetailsBean: def is_flash_attention(self): return "flashattention" in self.op_type.lower() - def is_cube(self): + def is_matmul(self): return "matmul" in self.op_type.lower() def is_conv(self): @@ -79,9 +105,18 @@ class KernelDetailsBean: def is_page_attention(self): return "pagedattention" in self.op_type.lower() + def is_trans(self): + return any(trans_mask in self.name.lower() for trans_mask in Constant.KERNEL_TRANS_MASK) + + def is_cube_kernel_cat(self): + return self.mac_time > 0 or self.aicore_time > 0 + def init(self): self._op_type = self._data.get('Type', "") self._name = self._data.get('Name', "") + self._input_shapes = self._data.get('Input Shapes', "") self._aiv_vec_time = self._data.get('aiv_vec_time(us)', "") + self._aicore_time = self._data.get("aicore_time(us)", "") self._mac_time = self._data.get('mac_time(us)', "") self._duration = self._data.get('Duration(us)', 0) + self._start_time = Decimal(self._data.get("Start Time(us)", "0")) diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py index cef6bb07124..245b51d105e 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py @@ -114,6 +114,21 @@ class TraceEventBean: def is_torch_op(self, value: bool): self._is_torch_op = value + @classmethod + def is_sdma(cls): + return False + + @classmethod + def 
is_page_attention(cls): + return False + + @classmethod + def is_trans(cls) -> bool: + """ + 暂时没找到GPU判断trans的方法,暂时都是notrans + """ + return False + def is_m_mode(self) -> bool: return self._ph == "M" @@ -199,11 +214,44 @@ class TraceEventBean: self._name = name def is_conv(self): - return self.name.lower().startswith("aten::conv") + return self.lower_name.startswith("aten::conv") def is_lccl(self): return self.lower_name == "kernel_aivec" + def is_fa_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return any(cube_mask in self.lower_name for cube_mask in Constant.CPU_OP_FA_MASK) + + def is_conv_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return self.lower_name.startswith(Constant.CPU_OP_CONV) + + def is_matmul_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return any(bwd_mask in self.lower_name for bwd_mask in Constant.CPU_OP_MATMUL_MASK) + + def is_bwd_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return any(bwd_mask in self.lower_name for bwd_mask in Constant.BWD_LIST) + + def is_cpu_cube_op(self) -> bool: + return self.is_matmul_for_cpu_op() or self.is_fa_for_cpu_op() or self.is_conv_for_cpu_op() + + def is_vector(self): + return not any(cube_mask in self.lower_name for cube_mask in Constant.KERNEL_CUBE_MASK) + + def is_cube_kernel_cat(self): + return any(cube_mask in self.lower_name for cube_mask in Constant.KERNEL_CUBE_MASK) + def init(self): if isinstance(self._event, dict): self._pid = self._event.get("pid", 0) diff --git a/profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py b/profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py new file mode 100644 index 00000000000..544f8f5234d --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py @@ -0,0 +1,255 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from math import isclose + +from compare_backend.compare_bean.profiling_info import ProfilingInfo +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + +class OverallMetricsBean: + TABLE_NAME = Constant.OVERALL_METRICS_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, base_info: ProfilingInfo, comparison_info: ProfilingInfo): + self._base_data = OverallMetricsInfo(base_info).overall_metrics + self._comparison_data = OverallMetricsInfo(comparison_info).overall_metrics + + @property + def rows(self): + rows_data = [] + for index, base_data in self._base_data.items(): + comparison_data = self._comparison_data.get(index) + row = self.get_row_data(index, base_data, comparison_data) + if row: + rows_data.append(row) + return rows_data + + @staticmethod + def get_row_data(index, base_data, comparison_data): + if isclose(base_data[0], 0) and isclose(comparison_data[0], 0): + return [] + row_data = [index] + row_data.extend(base_data) + row_data.extend(comparison_data) + row_data.extend(calculate_diff_ratio(base_data[0], comparison_data[0])) + return row_data + + +class OverallMetricsInfo: + def __init__(self, profiling_info: ProfilingInfo): + self._profiling_info = profiling_info + self._overall_metrics_data_map = { + 
ExcelConfig.COMPUTING: self.computing_data, + ExcelConfig.FA: self.fa_data, + ExcelConfig.FA_FWD_CUBE: self.fa_fwd_cube_data, + ExcelConfig.FA_FWD_VECTOR: self.fa_fwd_vector_data, + ExcelConfig.FA_BWD_CUBE: self.fa_bwd_cube_data, + ExcelConfig.FA_BWD_VECTOR: self.fa_bwd_vector_data, + ExcelConfig.CONV: self.conv_data, + ExcelConfig.CONV_FWD_CUBE: self.conv_fwd_cube_data, + ExcelConfig.CONV_FWD_VECTOR: self.conv_fwd_vector_data, + ExcelConfig.CONV_BWD_CUBE: self.conv_bwd_cube_data, + ExcelConfig.CONV_BWD_VECTOR: self.conv_bwd_vector_data, + ExcelConfig.MM: self.mm_data, + ExcelConfig.MM_CUBE: self.mm_cube_data, + ExcelConfig.MM_VECTOR: self.mm_vector_data, + ExcelConfig.PA: self.pa_data, + ExcelConfig.VECTOR: self.vector_data, + ExcelConfig.VECTOR_TRANS: self.vector_trans_data, + ExcelConfig.VECTOR_NO_TRANS: self.vector_no_trans_data, + ExcelConfig.CUBE: self.cube_data, + ExcelConfig.SDMA_TM: self.sdma_tm_data, + ExcelConfig.OTHER: self.other_data, + ExcelConfig.COMMUNICATION_TIME: self.communication_data, + ExcelConfig.WAIT: self.wait_data, + ExcelConfig.TRANSMIT: self.transmit_data, + ExcelConfig.FREE_TIME: self.free_time_data, + ExcelConfig.SDMA: self.sdma_data, + ExcelConfig.FREE: self.free_data, + ExcelConfig.E2E_TIME: self.e2e_time_data + } + + @property + def overall_metrics(self): + return self._overall_metrics_data_map + + @property + def computing_data(self): + return [self._profiling_info.compute_time_ms, + self._profiling_info.compute_time_ms / self._profiling_info.e2e_time_ms, + sum((self._profiling_info.fa_total_num, self._profiling_info.conv_total_num, + self._profiling_info.mm_total_num, self._profiling_info.vector_total_num, + self._profiling_info.sdma_num_tensor_move, self._profiling_info.other_cube_num, + self._profiling_info.page_attention_num))] + + @property + def fa_data(self): + return [self._profiling_info.fa_total_time, + self._profiling_info.fa_total_time / self._profiling_info.e2e_time_ms, + self._profiling_info.fa_total_num] + + 
@property + def fa_fwd_cube_data(self): + return [self._profiling_info.fa_time_fwd_cube, + self._profiling_info.fa_time_fwd_cube / self._profiling_info.e2e_time_ms, + self._profiling_info.fa_num_fwd_cube] + + @property + def fa_fwd_vector_data(self): + return [self._profiling_info.fa_time_fwd_vector, + self._profiling_info.fa_time_fwd_vector / self._profiling_info.e2e_time_ms, + self._profiling_info.fa_num_fwd_vector] + + @property + def fa_bwd_cube_data(self): + return [self._profiling_info.fa_time_bwd_cube, + self._profiling_info.fa_time_bwd_cube / self._profiling_info.e2e_time_ms, + self._profiling_info.fa_num_bwd_cube] + + @property + def fa_bwd_vector_data(self): + return [self._profiling_info.fa_time_bwd_vector, + self._profiling_info.fa_time_bwd_vector / self._profiling_info.e2e_time_ms, + self._profiling_info.fa_num_bwd_vector] + + @property + def conv_data(self): + return [self._profiling_info.conv_total_time, + self._profiling_info.conv_total_time / self._profiling_info.e2e_time_ms, + self._profiling_info.conv_total_num] + + @property + def conv_fwd_cube_data(self): + return [self._profiling_info.conv_time_fwd_cube, + self._profiling_info.conv_time_fwd_cube / self._profiling_info.e2e_time_ms, + self._profiling_info.conv_num_fwd_cube] + + @property + def conv_fwd_vector_data(self): + return [self._profiling_info.conv_time_fwd_vector, + self._profiling_info.conv_time_fwd_vector / self._profiling_info.e2e_time_ms, + self._profiling_info.conv_num_fwd_vector] + + @property + def conv_bwd_cube_data(self): + return [self._profiling_info.conv_time_bwd_cube, + self._profiling_info.conv_time_bwd_cube / self._profiling_info.e2e_time_ms, + self._profiling_info.conv_num_bwd_cube] + + @property + def conv_bwd_vector_data(self): + return [self._profiling_info.conv_time_bwd_vector, + self._profiling_info.conv_time_bwd_vector / self._profiling_info.e2e_time_ms, + self._profiling_info.conv_num_bwd_vector] + + @property + def mm_data(self): + return 
[self._profiling_info.mm_total_time, + self._profiling_info.mm_total_time / self._profiling_info.e2e_time_ms, + self._profiling_info.mm_total_num] + + @property + def mm_cube_data(self): + return [self._profiling_info.matmul_time_cube, + self._profiling_info.matmul_time_cube / self._profiling_info.e2e_time_ms, + self._profiling_info.matmul_num_cube] + + @property + def mm_vector_data(self): + return [self._profiling_info.matmul_time_vector, + self._profiling_info.matmul_time_vector / self._profiling_info.e2e_time_ms, + self._profiling_info.matmul_num_vector] + + @property + def pa_data(self): + return [self._profiling_info.page_attention_time, + self._profiling_info.page_attention_time / self._profiling_info.e2e_time_ms, + self._profiling_info.page_attention_num] + + @property + def vector_data(self): + return [self._profiling_info.vector_total_time, + self._profiling_info.vector_total_time / self._profiling_info.e2e_time_ms, + self._profiling_info.vector_total_num] + + @property + def vector_trans_data(self): + return [self._profiling_info.vector_time_trans, + self._profiling_info.vector_time_trans / self._profiling_info.e2e_time_ms, + self._profiling_info.vector_num_trans] + + @property + def vector_no_trans_data(self): + return [self._profiling_info.vector_time_notrans, + self._profiling_info.vector_time_notrans / self._profiling_info.e2e_time_ms, + self._profiling_info.vector_num_notrans] + + @property + def cube_data(self): + return [self._profiling_info.other_cube_time, + self._profiling_info.other_cube_time / self._profiling_info.e2e_time_ms, + self._profiling_info.other_cube_num] + + @property + def sdma_tm_data(self): + return [self._profiling_info.sdma_time_tensor_move, + self._profiling_info.sdma_time_tensor_move / self._profiling_info.e2e_time_ms, + self._profiling_info.sdma_num_tensor_move] + + @property + def other_data(self): + other_time = max((0, + self._profiling_info.compute_time_ms - self._profiling_info.fa_total_time - + 
self._profiling_info.conv_total_time - self._profiling_info.mm_total_time - + self._profiling_info.vector_total_time - self._profiling_info.sdma_time_tensor_move - + self._profiling_info.other_cube_time - self._profiling_info.page_attention_time)) + return [other_time, other_time / self._profiling_info.e2e_time_ms, "/"] + + @property + def communication_data(self): + return [self._profiling_info.communication_not_overlapped_ms, + self._profiling_info.communication_not_overlapped_ms / self._profiling_info.e2e_time_ms, "/"] + + @property + def wait_data(self): + return [self._profiling_info.wait_time_ms, + self._profiling_info.wait_time_ms / self._profiling_info.e2e_time_ms, "/"] + + @property + def transmit_data(self): + return [self._profiling_info.transmit_time_ms, + self._profiling_info.transmit_time_ms / self._profiling_info.e2e_time_ms, "/"] + + @property + def free_time_data(self): + return [self._profiling_info.free_time_ms, + self._profiling_info.free_time_ms / self._profiling_info.e2e_time_ms, "/"] + + @property + def sdma_data(self): + return [self._profiling_info.sdma_time_stream, + self._profiling_info.sdma_time_stream / self._profiling_info.e2e_time_ms, "/"] + + @property + def free_data(self): + free = self._profiling_info.free_time_ms - self._profiling_info.sdma_time_stream + return [free, free / self._profiling_info.e2e_time_ms, "/"] + + @property + def e2e_time_data(self): + return [self._profiling_info.e2e_time_ms, 1, "/"] diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py index e5d9bf26e98..e0a80a4d30d 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py +++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py @@ -37,6 +37,105 @@ class ProfilingInfo: self.hide_op_details = False self.is_level0 = False + # 性能拆解新指标 + self.fa_time_fwd_cube = 0.0 + self.fa_num_fwd_cube = 0 + self.fa_time_bwd_cube = 
0.0 + self.fa_num_bwd_cube = 0 + self.fa_time_fwd_vector = 0.0 + self.fa_num_fwd_vector = 0 + self.fa_time_bwd_vector = 0.0 + self.fa_num_bwd_vector = 0 + + self.conv_time_fwd_cube = 0.0 + self.conv_num_fwd_cube = 0 + self.conv_time_bwd_cube = 0.0 + self.conv_num_bwd_cube = 0 + self.conv_time_fwd_vector = 0.0 + self.conv_num_fwd_vector = 0 + self.conv_time_bwd_vector = 0.0 + self.conv_num_bwd_vector = 0 + + self.matmul_time_cube = 0.0 + self.matmul_num_cube = 0 + self.matmul_time_vector = 0.0 + self.matmul_num_vector = 0 + + self.page_attention_time = 0.0 + self.page_attention_num = 0 + + self.vector_time_trans = 0.0 + self.vector_num_trans = 0 + self.vector_time_notrans = 0.0 + self.vector_num_notrans = 0 + + self.sdma_time_tensor_move = 0.0 + self.sdma_num_tensor_move = 0 + self.sdma_time_stream = 0.0 + self.sdma_num_stream = 0 + + self.other_cube_time = 0.0 + self.other_cube_num = 0 + + @property + def e2e_time_ms(self): + return self.e2e_time * 10 ** 3 + + @property + def compute_time_ms(self): + return self.compute_time * 10 ** 3 + + @property + def free_time_ms(self): + return self.scheduling_time * 10 ** 3 + + @property + def communication_not_overlapped_ms(self): + return self.communication_not_overlapped * 10 ** 3 + + @property + def wait_time_ms(self): + return self.wait_time * 10 ** 3 + + @property + def transmit_time_ms(self): + return (self.communication_not_overlapped - self.wait_time) * 10 ** 3 + + @property + def fa_total_time(self): + return sum((self.fa_time_fwd_cube, self.fa_time_fwd_vector, self.fa_time_bwd_cube, self.fa_time_bwd_vector)) + + @property + def fa_total_num(self): + return sum((self.fa_num_fwd_cube, self.fa_num_fwd_vector, self.fa_num_bwd_cube, self.fa_num_bwd_vector)) + + @property + def conv_total_time(self): + return sum( + (self.conv_time_fwd_cube, self.conv_time_fwd_vector, self.conv_time_bwd_cube, + self.conv_time_bwd_vector)) + + @property + def conv_total_num(self): + return sum((self.conv_num_fwd_cube, 
self.conv_num_fwd_vector, self.conv_num_bwd_cube, + self.conv_num_bwd_vector)) + + @property + def mm_total_time(self): + return sum((self.matmul_time_cube, self.matmul_time_vector)) + + @property + def mm_total_num(self): + return sum((self.matmul_num_cube, self.matmul_num_vector)) + + @property + def vector_total_time(self): + return sum((self.vector_time_trans, self.vector_time_notrans)) + + @property + def vector_total_num(self): + return sum((self.vector_num_trans, self.vector_num_notrans)) + def trans_time_to_s(self): self.cube_time = self.cube_time / 10 ** 6 self.other_time = self.other_time / 10 ** 6 @@ -54,6 +153,24 @@ class ProfilingInfo: self.conv_time_fwd = self.conv_time_fwd / 10 ** 6 self.conv_time_bwd = self.conv_time_bwd / 10 ** 6 + # 新指标单位为ms + self.fa_time_fwd_cube /= 10 ** 3 + self.fa_time_bwd_cube /= 10 ** 3 + self.fa_time_fwd_vector /= 10 ** 3 + self.fa_time_bwd_vector /= 10 ** 3 + self.conv_time_fwd_cube /= 10 ** 3 + self.conv_time_bwd_cube /= 10 ** 3 + self.conv_time_fwd_vector /= 10 ** 3 + self.conv_time_bwd_vector /= 10 ** 3 + self.matmul_time_cube /= 10 ** 3 + self.matmul_time_vector /= 10 ** 3 + self.vector_time_trans /= 10 ** 3 + self.vector_time_notrans /= 10 ** 3 + self.sdma_time_tensor_move /= 10 ** 3 + self.sdma_time_stream /= 10 ** 3 + self.page_attention_time /= 10 ** 3 + self.other_cube_time /= 10 ** 3 + def calculate_other_time(self): self.other_time = max( [0, self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd - @@ -64,8 +181,7 @@ class ProfilingInfo: - self.conv_time_fwd - self.conv_time_bwd def calculate_schedule_time(self): - self.scheduling_time = (self.e2e_time - self.compute_time - self.lccl_time \ - - self.communication_not_overlapped) + self.scheduling_time = (self.e2e_time - self.compute_time - self.lccl_time - self.communication_not_overlapped) def update_fa_fwd_info(self, time: float): self.fa_time_fwd += time @@ -75,6 +191,30 @@ class ProfilingInfo: self.fa_time_bwd += time self.fa_num_bwd += 1 
+ def update_fa_fwd_cube_info(self, time: float): + self.fa_time_fwd_cube += time + self.fa_num_fwd_cube += 1 + + def update_fa_bwd_cube_info(self, time: float): + self.fa_time_bwd_cube += time + self.fa_num_bwd_cube += 1 + + def update_fa_fwd_vector_info(self, time: float): + self.fa_time_fwd_vector += time + self.fa_num_fwd_vector += 1 + + def update_fa_bwd_vector_info(self, time: float): + self.fa_time_bwd_vector += time + self.fa_num_bwd_vector += 1 + + def update_sdma_tensor_move_info(self, time: float): + self.sdma_time_tensor_move += time + self.sdma_num_tensor_move += 1 + + def update_sdma_stream_info(self, time: float, num: int = 1): + self.sdma_time_stream += time + self.sdma_num_stream += num + def update_pa_info(self, time: float): self.pa_time += time self.pa_num += 1 @@ -91,6 +231,42 @@ class ProfilingInfo: self.conv_time_bwd += time self.conv_num_bwd += 1 + def update_conv_bwd_cube_info(self, time: float): + self.conv_time_bwd_cube += time + self.conv_num_bwd_cube += 1 + + def update_conv_fwd_cube_info(self, time: float): + self.conv_time_fwd_cube += time + self.conv_num_fwd_cube += 1 + + def update_conv_bwd_vector_info(self, time: float): + self.conv_time_bwd_vector += time + self.conv_num_bwd_vector += 1 + + def update_conv_fwd_vector_info(self, time: float): + self.conv_time_fwd_vector += time + self.conv_num_fwd_vector += 1 + + def update_matmul_cube_info(self, time: float): + self.matmul_time_cube += time + self.matmul_num_cube += 1 + + def update_matmul_vector_info(self, time: float): + self.matmul_time_vector += time + self.matmul_num_vector += 1 + + def update_page_attention_info(self, time: float): + self.page_attention_time += time + self.page_attention_num += 1 + + def update_vector_trans_info(self, time: float): + self.vector_time_trans += time + self.vector_num_trans += 1 + + def update_vector_notrans_info(self, time: float): + self.vector_time_notrans += time + self.vector_num_notrans += 1 + def update_sdma_info(self, time: float, num: 
int = 1): self.sdma_time += time self.sdma_num += num @@ -103,6 +279,10 @@ class ProfilingInfo: self.vec_time += time self.vec_num += 1 + def update_other_cube_info(self, time: float): + self.other_cube_time += time + self.other_cube_num += 1 + def set_compute_time(self, time: float): self.compute_time = time diff --git a/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py index fdce23c6ab4..3106527c419 100644 --- a/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py +++ b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py @@ -17,3 +17,20 @@ class OperatorDataPrepare: else: result_data.append(level1_node) return result_data + + def get_all_layer_ops(self) -> any: + root_node = TreeBuilder.build_tree(self.profiling_data.torch_op_data, [], []) + level1_child_nodes = root_node.child_nodes + node_queue = [] + result_data = [] + for level1_node in level1_child_nodes: + if level1_node.is_step_profiler(): + node_queue.extend(level1_node.child_nodes) + else: + node_queue.append(level1_node) + while len(node_queue) > 0: + node = node_queue.pop(0) + result_data.append(node) + if node.child_nodes: + node_queue.extend(node.child_nodes) + return result_data \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py b/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py index c89e8451930..7bac2b03353 100644 --- a/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py +++ b/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py @@ -31,4 +31,30 @@ class OverallPerfInterface: def _generate_result(self): overall_data = self._profiling_data.overall_metrics - self._result_data = getattr(overall_data, "__dict__", {}) + + self._result_data = { + "profiling_type": overall_data.profiling_type, + 
"minimal_profiling": overall_data.minimal_profiling, + "overall": {"e2e_time_ms": overall_data.e2e_time_ms, + "computing_time_ms": overall_data.compute_time_ms, + "uncovered_communication_time_ms": overall_data.communication_not_overlapped_ms, + "free_time_ms": overall_data.free_time_ms}, + "computing_time_disaggregate": {"fa_time_ms": overall_data.fa_total_time, + "conv_time_ms": overall_data.conv_total_time, + "matmul_time_ms": overall_data.mm_total_time, + "page_attention_time_ms": overall_data.page_attention_time, + "vector_time_ms": overall_data.vector_total_time, + "tensor_move_time_ms": overall_data.sdma_time_tensor_move, + "other_cube_time_ms": overall_data.other_cube_time}, + "computing_num_disaggregate": {"fa_num": overall_data.fa_total_num, + "conv_num": overall_data.conv_total_num, + "matmul_num": overall_data.mm_total_num, + "page_attention_num": overall_data.page_attention_num, + "vector_num": overall_data.vector_total_num, + "tensor_move_num": overall_data.sdma_num_tensor_move, + "other_cube_num": overall_data.other_cube_num}, + "communication_time_disaggregate": {"wait_time_ms": overall_data.wait_time_ms, + "transmit_time_ms": overall_data.transmit_time_ms}, + "free_time_disaggregate": {"sdma_time_ms": overall_data.sdma_time_stream, + "free_ms": overall_data.free_time_ms - overall_data.sdma_time_stream} + } diff --git a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 5b93d888a4b..6fe693fb067 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -8,6 +8,9 @@ from compare_backend.comparator.module_comparetor import ModuleComparator from compare_backend.comparator.module_statistic_comparator import ModuleStatisticComparator from compare_backend.comparator.operator_comparator import OperatorComparator from 
compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator +from compare_backend.comparator.api_compare_comparator import ApiCompareComparator +from compare_backend.comparator.kernel_compare_comparator import KernelCompareComparator +from compare_backend.comparator.overall_metrics_comparator import OverallMetricsComparator from compare_backend.compare_bean.communication_bean import CommunicationBean from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean from compare_backend.compare_bean.memory_statistic_bean import MemoryStatisticBean @@ -15,6 +18,9 @@ from compare_backend.compare_bean.module_compare_bean import ModuleCompareBean from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean +from compare_backend.compare_bean.api_compare_bean import ApiCompareBean +from compare_backend.compare_bean.kernel_compare_bean import KernelCompareBean +from compare_backend.compare_bean.overall_metrics_bean import OverallMetricsBean from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare from compare_backend.generator.base_generator import BaseGenerator @@ -37,12 +43,22 @@ class DetailPerformanceGenerator(BaseGenerator): return op_compare_result def compare(self): - if self._args.enable_operator_compare or self._args.enable_memory_compare or \ - self._args.enable_communication_compare: + enable_compare = [self._args.enable_operator_compare, self._args.enable_memory_compare, + self._args.enable_communication_compare, self._args.enable_api_compare, + self._args.enable_kernel_compare] + if any(enable_compare): print("[INFO] Start to compare performance detail data, please wait.") comparator_list = self._create_comparator() - for 
comparator in comparator_list: - self._result_data.update(comparator.generate_data()) + else: + comparator_list = [] + if self._args.enable_profiling_compare: + overall_data = {Constant.BASE_DATA: self._profiling_data_dict.get(Constant.BASE_DATA).overall_metrics, + Constant.COMPARISON_DATA: self._profiling_data_dict.get( + Constant.COMPARISON_DATA).overall_metrics} + # overall 数据在最前面 + comparator_list.insert(0, OverallMetricsComparator(overall_data, OverallMetricsBean)) + for comparator in comparator_list: + self._result_data.update(comparator.generate_data()) def generate_view(self): if not self._result_data: @@ -57,6 +73,7 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list = [] op_compare_result = [] + if self._args.enable_operator_compare: module_compare_result = self.match_nn_module() if self._profiling_data_dict.get( Constant.BASE_DATA).python_function_data and self._profiling_data_dict.get( @@ -86,6 +103,18 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) if not self._args.disable_details: comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) + if self._args.enable_api_compare: + api_compare_result = { + Constant.BASE_DATA: OperatorDataPrepare( + self._profiling_data_dict.get(Constant.BASE_DATA)).get_all_layer_ops(), + Constant.COMPARISON_DATA: OperatorDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).get_all_layer_ops()} + comparator_list.append(ApiCompareComparator(api_compare_result, ApiCompareBean)) + if self._args.enable_kernel_compare: + kernel_compare_result = { + Constant.BASE_DATA: self._profiling_data_dict.get(Constant.BASE_DATA).kernel_details, + Constant.COMPARISON_DATA: self._profiling_data_dict.get(Constant.COMPARISON_DATA).kernel_details} + comparator_list.append(KernelCompareComparator(kernel_compare_result, KernelCompareBean)) return comparator_list def match_torch_op(self) -> list: 
diff --git a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py index 2127ff5e75e..9daaa55ef16 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py @@ -2,6 +2,7 @@ from abc import abstractmethod, ABC from decimal import Decimal from compare_backend.compare_bean.origin_data_bean.compare_event import KernelEvent, MemoryEvent +from compare_backend.compare_bean.origin_data_bean.kernel_details_bean import KernelDetailsBean from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean from compare_backend.compare_bean.profiling_info import ProfilingInfo from compare_backend.utils.constant import Constant @@ -19,6 +20,7 @@ class ProfilingResult: self.overall_metrics = ProfilingInfo(profiling_type) self.python_function_data = [] self.fwdbwd_dict = {} + self.kernel_details = {} def update_torch_op_data(self, event: TraceEventBean): event.is_torch_op = True @@ -42,6 +44,9 @@ class ProfilingResult: def update_comm_task_data(self, comm_name: str, task_event: TraceEventBean): self.communication_dict.setdefault(comm_name, {}).setdefault("comm_task", {}).setdefault( task_event.name, []).append(task_event.dur) + + def update_kernel_details(self, kernels: dict): + self.kernel_details = kernels class BaseProfilingParser(ABC): @@ -56,6 +61,8 @@ class BaseProfilingParser(ABC): self._enable_operator_compare = args.enable_operator_compare self._enable_memory_compare = args.enable_memory_compare self._enable_communication_compare = args.enable_communication_compare + self._enable_api_compare = args.enable_api_compare + self._enable_kernel_compare = args.enable_kernel_compare self._dispatch_func = self._get_dispatch_func() self._result_data = ProfilingResult(self._profiling_type) self._memory_events = [] @@ -66,6 +73,22 @@ 
class BaseProfilingParser(ABC): self._comm_list = [] self._read_trace_event() self._cur_func_index = 0 + self._categorize_performance_index = 0 + self._cpu_cube_op = None + self._bwd_tid = None + + @property + def cpu_cube_op(self): + if self._cpu_cube_op is not None: + return self._cpu_cube_op + cpu_cube_op = [op for op in self._result_data.torch_op_data if op.is_cpu_cube_op()] + cpu_cube_op.sort(key=lambda x: x.start_time) + self._cpu_cube_op = cpu_cube_op + return self._cpu_cube_op + + @abstractmethod + def _update_kernel_details(self): + raise NotImplementedError("Function _update_kernel_details need to be implemented.") @abstractmethod def _update_memory_list(self): @@ -99,9 +122,95 @@ class BaseProfilingParser(ABC): self._update_memory_list() if self._enable_profiling_compare: self._update_overall_metrics() + if self._enable_kernel_compare: + self._update_kernel_details() self._check_result_data() return self._result_data + def categorize_computing_performance_data(self, tk: (TraceEventBean, KernelDetailsBean), flow_dict_new: dict): + if tk.is_page_attention(): + self._result_data.overall_metrics.update_page_attention_info(tk.dur) + return + if tk.is_sdma(): + self._result_data.overall_metrics.update_sdma_tensor_move_info(tk.dur) + return + flow_start_time = flow_dict_new.get(tk.start_time) + if flow_start_time: + while self._categorize_performance_index < len(self.cpu_cube_op): + cur_op = self.cpu_cube_op[self._categorize_performance_index] + if cur_op.end_time < flow_start_time: + self._categorize_performance_index += 1 + continue + if cur_op.start_time <= flow_start_time: + self._categorize_cube_performance_data(cur_op, tk) + return + break + if self._profiling_type == Constant.NPU: + # 缺失torch至npu连线的算子,判断fa/conv/matmul使用kernel_details.csv的op_type字段 + if tk.is_flash_attention(): + if tk.is_fa_bwd(): + self._result_data.overall_metrics.update_fa_bwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_fa_fwd_cube_info(tk.dur) + return + elif 
tk.is_conv(): + if tk.is_conv_bwd(): + self._result_data.overall_metrics.update_conv_bwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_conv_fwd_cube_info(tk.dur) + return + elif tk.is_matmul(): + self._result_data.overall_metrics.update_matmul_cube_info(tk.dur) + return + if tk.is_cube_kernel_cat(): + self._result_data.overall_metrics.update_other_cube_info(tk.dur) + elif tk.is_trans(): + self._result_data.overall_metrics.update_vector_trans_info(tk.dur) + else: + self._result_data.overall_metrics.update_vector_notrans_info(tk.dur) + + def _categorize_cube_performance_data(self, cpu_op: TraceEventBean, tk: (TraceEventBean, KernelDetailsBean)): + """ + 判断fa/conv/matmul/vector使用cpu_op + """ + if cpu_op.is_fa_for_cpu_op(): + if self._is_backward(cpu_op): + if tk.is_cube_kernel_cat(): + self._result_data.overall_metrics.update_fa_bwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_fa_bwd_vector_info(tk.dur) + else: + if tk.is_cube_kernel_cat(): + self._result_data.overall_metrics.update_fa_fwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_fa_fwd_vector_info(tk.dur) + elif cpu_op.is_conv_for_cpu_op(): + if self._is_backward(cpu_op): + if tk.is_cube_kernel_cat(): + self._result_data.overall_metrics.update_conv_bwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_conv_bwd_vector_info(tk.dur) + else: + if tk.is_cube_kernel_cat(): + self._result_data.overall_metrics.update_conv_fwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_conv_fwd_vector_info(tk.dur) + elif cpu_op.is_matmul_for_cpu_op(): # matmul + if tk.is_cube_kernel_cat(): + self._result_data.overall_metrics.update_matmul_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_matmul_vector_info(tk.dur) + + def _is_backward(self, event: TraceEventBean): + return event.tid == self._bwd_tid or event.is_bwd_for_cpu_op() + + def _get_flow_time_dict(self): + return { + flow_event["end"].start_time: 
flow_event["start"].start_time + for flow_event in self._flow_dict.values() + if flow_event.get("end") and flow_event.get("start") + } + def _dispatch_events(self): if not self._dispatch_func: return @@ -194,7 +303,7 @@ class BaseProfilingParser(ABC): task_index += 1 def _check_result_data(self): - if self._enable_operator_compare or self._enable_memory_compare: + if self._enable_operator_compare or self._enable_memory_compare or self._enable_api_compare: if not self._result_data.torch_op_data: print(f"[WARNING] Can't find any torch op in the file: {self._profiling_path}") if self._enable_operator_compare and not self._result_data.kernel_dict: @@ -203,6 +312,11 @@ class BaseProfilingParser(ABC): print(f"[WARNING] Can't find any memory event in the file: {self._profiling_path}") if self._enable_communication_compare and not self._result_data.communication_dict: print(f"[WARNING] Can't find any communication op in the file: {self._profiling_path}") + if self._enable_kernel_compare and not self._result_data.kernel_details: + if self._profiling_type == Constant.GPU: + print(f"[WARNING] kernel compare between GPU data and NPU data is not supported.") + else: + print(f"[WARNING] Can't find any kernel details in the file: {self._profiling_path}") def _read_trace_event(self): try: diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py index c4089aec9bd..0aeeba83efb 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -20,6 +20,7 @@ class GPUProfilingParser(BaseProfilingParser): self._compute_stream_id = self._infer_compute_stream_id() self._marks = defaultdict(int) self._aten_index = 0 + self._find_bwd_tid() @classmethod def __is_flash_attention(cls, name: str): @@ -30,10 +31,10 @@ class GPUProfilingParser(BaseProfilingParser): 
@classmethod def __is_sdma_time(cls, name: str): - for mark in cls.SDMA_MARK_LIST: - if mark in name.lower(): - return True - return False + return any(mask in name.lower() for mask in cls.SDMA_MARK_LIST) + + def _update_kernel_details(self): + pass def _update_memory_list(self): if not self._enable_memory_compare: @@ -68,19 +69,15 @@ class GPUProfilingParser(BaseProfilingParser): min_ts = sys.float_info.max max_ts = sys.float_info.min self._trace_events.sort(key=lambda x: x.start_time) - aten_events = list(filter(lambda x: x.name.startswith("aten::"), self._trace_events)) - flow_dict_new = {} - for flow_event in self._flow_dict.values(): - start_event = flow_event.get("start") - end_event = flow_event.get("end") - if start_event and end_event: - flow_dict_new[end_event.start_time] = start_event.start_time + aten_events = [event for event in self._trace_events if event.name.startswith("aten::")] + flow_dict_new = self._get_flow_time_dict() for event in self._trace_events: if event.stream: min_ts = min(event.start_time, min_ts) max_ts = max(event.end_time, max_ts) if event.stream == self._compute_stream_id and self.__is_sdma_time(event.name): self._result_data.overall_metrics.update_sdma_info(event.dur) + self._result_data.overall_metrics.update_sdma_stream_info(event.dur) continue if not event.is_kernel_cat(): continue @@ -88,6 +85,7 @@ class GPUProfilingParser(BaseProfilingParser): if event.is_nccl_name(): continue self.__add_compute_time(event, aten_events, flow_dict_new) + self.categorize_computing_performance_data(event, flow_dict_new) self._aten_events = None self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) self.__add_compute_and_overlap_time() @@ -162,7 +160,7 @@ class GPUProfilingParser(BaseProfilingParser): def _get_dispatch_func(self): func_set = set() - if self._enable_memory_compare or self._enable_operator_compare: + if self._enable_memory_compare or self._enable_operator_compare or self._enable_profiling_compare: 
func_set.add(self._picking_torch_op_event) if self._enable_communication_compare: func_set.add(self._picking_kernel_event) @@ -174,6 +172,10 @@ class GPUProfilingParser(BaseProfilingParser): func_set.add(self._picking_flow_event) if self._enable_memory_compare or self._enable_profiling_compare: func_set.add(self._picking_memory_event) + if self._enable_profiling_compare: + func_set.add(self._picking_flow_event) + if self._enable_api_compare: + func_set.add(self._picking_torch_op_event) return list(func_set) def _infer_compute_stream_id(self): @@ -187,3 +189,9 @@ class GPUProfilingParser(BaseProfilingParser): raise RuntimeError('[ERROR] The profiling data does not contain kernel running data.') counter = Counter(kernel_stream_ids) return counter.most_common(1)[0][0] + + def _find_bwd_tid(self): + for event in self._trace_events: + if event.is_fwdbwd() and event.is_flow_end(): + self._bwd_tid = event.tid + break diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index 70ce44b44eb..cb25c252c6c 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -36,7 +36,7 @@ class NPUProfilingParser(BaseProfilingParser): def _get_dispatch_func(self): func_list = set() - if self._enable_memory_compare or self._enable_operator_compare: + if self._enable_memory_compare or self._enable_operator_compare or self._enable_profiling_compare: func_list.add(self._picking_torch_op_event) if self._enable_operator_compare or self._args.max_kernel_num: func_list.add(self._picking_kernel_event) @@ -52,8 +52,33 @@ class NPUProfilingParser(BaseProfilingParser): func_list.add(self._picking_overlap_analysis_data) func_list.add(self._picking_kernel_event) func_list.add(self._picking_hccl_event) + func_list.add(self._picking_flow_event) + if 
self._enable_api_compare: + func_list.add(self._picking_torch_op_event) return list(func_list) + def _update_kernel_details(self): + try: + kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) + except FileNotFoundError: + print("[WARNING] The file kernel_details.csv does not exist.") + except Exception: + print("[ERROR] Failed to read kernel_details.csv.") + return + if not kernel_details: + return + kernels_dict = {} + for kernel in kernel_details: + if kernel.is_invalid(): + continue + input_shapes = kernel.input_shapes if kernel.input_shapes else 'N/A' + kernels_dict.setdefault(kernel.op_type, {}).setdefault(input_shapes, []).append( + [kernel.name, kernel.duration]) + if len(kernels_dict) == 1: + print("[ERROR] Failed to enable enable_kernel_compare, type of kernel_details.csv is null.") + return + self._result_data.update_kernel_details(kernels_dict) + def _update_memory_list(self): try: memory_data = FileReader.read_csv_file(self._operator_memory_path, OperatorMemoryBean) @@ -205,6 +230,8 @@ class NPUProfilingParser(BaseProfilingParser): def _filter_meta_id(self): for event in self._trace_events: + if event.is_fwdbwd() and event.is_flow_end(): + self._bwd_tid = event.tid if not event.is_process_meta(): continue if event.is_hccl_process_name(): @@ -244,17 +271,7 @@ class NPUProfilingParser(BaseProfilingParser): self._result_data.overall_metrics.update_lccl_info(event.dur) def __parse_kernel_csv(self): - try: - kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) - except Exception: - print('[WARNING] Npu kernel details csv file is not available.') - return - if not kernel_details or kernel_details[0].is_hide_op_pmu(): - self._result_data.overall_metrics.hide_op_details = True - return - for kernel in kernel_details: - if kernel.is_invalid(): - continue + def __screen_data(kernel: KernelDetailsBean): if kernel.is_flash_attention(): if kernel.is_fa_bwd(): 
self._result_data.overall_metrics.update_fa_bwd_info(kernel.duration) @@ -265,7 +282,7 @@ class NPUProfilingParser(BaseProfilingParser): self._result_data.overall_metrics.update_conv_bwd_info(kernel.duration) else: self._result_data.overall_metrics.update_conv_fwd_info(kernel.duration) - elif kernel.is_cube(): + elif kernel.is_matmul(): self._result_data.overall_metrics.update_cube_info(kernel.duration) elif kernel.is_sdma(): self._result_data.overall_metrics.update_sdma_info(kernel.duration) @@ -276,6 +293,22 @@ class NPUProfilingParser(BaseProfilingParser): else: self._result_data.overall_metrics.update_cube_info(kernel.duration) + try: + kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) + except Exception: + print('[WARNING] Npu kernel details csv file is not available.') + return + if not kernel_details or kernel_details[0].is_hide_op_pmu(): + self._result_data.overall_metrics.hide_op_details = True + return + flow_dict_new = self._get_flow_time_dict() + kernel_details.sort(key=lambda x: x.start_time) + for kernel in kernel_details: + if kernel.is_invalid(): + continue + __screen_data(kernel) + self.categorize_computing_performance_data(kernel, flow_dict_new) + def __parse_mem_csv(self): try: memory_record = FileReader.read_csv_file(self._memory_record_path, MemoryRecordBean) @@ -321,3 +354,4 @@ class NPUProfilingParser(BaseProfilingParser): for stream in compute_stream: dur_list = sdma_dict.get(stream, []) self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) + self._result_data.overall_metrics.update_sdma_stream_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools/compare_backend/utils/args_manager.py b/profiler/compare_tools/compare_backend/utils/args_manager.py index 4b5947fa7bc..ab9fb43a968 100644 --- a/profiler/compare_tools/compare_backend/utils/args_manager.py +++ b/profiler/compare_tools/compare_backend/utils/args_manager.py @@ -69,6 +69,14 @@ class ArgsManager: def 
enable_communication_compare(self): return self._args.enable_communication_compare + @property + def enable_api_compare(self): + return self._args.enable_api_compare + + @property + def enable_kernel_compare(self): + return self._args.enable_kernel_compare + @classmethod def check_profiling_path(cls, file_path: str): PathManager.input_path_common_check(file_path) @@ -119,11 +127,14 @@ class ArgsManager: raise RuntimeError(msg) if not any([self._args.enable_profiling_compare, self._args.enable_operator_compare, - self._args.enable_memory_compare, self._args.enable_communication_compare]): + self._args.enable_memory_compare, self._args.enable_communication_compare, + self._args.enable_api_compare, self._args.enable_kernel_compare]): self._args.enable_profiling_compare = True self._args.enable_operator_compare = True self._args.enable_memory_compare = True self._args.enable_communication_compare = True + self._args.enable_api_compare = True + self._args.enable_kernel_compare = True base_profiling_path = PathManager.get_realpath(self._args.base_profiling_path) self.check_profiling_path(base_profiling_path) diff --git a/profiler/compare_tools/compare_backend/utils/compare_args.py b/profiler/compare_tools/compare_backend/utils/compare_args.py index ab9bc364f44..9e6291e89e0 100644 --- a/profiler/compare_tools/compare_backend/utils/compare_args.py +++ b/profiler/compare_tools/compare_backend/utils/compare_args.py @@ -6,6 +6,8 @@ class Args: enable_operator_compare: bool = False, enable_memory_compare: bool = False, enable_communication_compare: bool = False, + enable_api_compare: bool = False, + enable_kernel_compare: bool = False, output_path: str = "", max_kernel_num: int = None, op_name_map: dict = {}, @@ -17,6 +19,8 @@ class Args: self.enable_operator_compare = enable_operator_compare self.enable_memory_compare = enable_memory_compare self.enable_communication_compare = enable_communication_compare + self.enable_api_compare = enable_api_compare + 
self.enable_kernel_compare = enable_kernel_compare self.output_path = output_path self.max_kernel_num = max_kernel_num self.op_name_map = op_name_map diff --git a/profiler/compare_tools/compare_backend/utils/constant.py b/profiler/compare_tools/compare_backend/utils/constant.py index e2854692ae3..252aa536e1c 100644 --- a/profiler/compare_tools/compare_backend/utils/constant.py +++ b/profiler/compare_tools/compare_backend/utils/constant.py @@ -11,6 +11,7 @@ class Constant(object): GREEN_COLOR = "00FF00" RED_COLOR = "FF0000" BLUE_COLOR = "00BFFF" + LIGHT_BLUE_COLOR = "87CEFA" US_TO_MS = 1000 KB_TO_MB = 1024 INVALID_VALUE = -1 @@ -38,13 +39,16 @@ class Constant(object): # compare type OPERATOR_COMPARE = "OperatorCompare" MEMORY_COMPARE = "MemoryCompare" - + API_COMPARE = "ApiCompare" + KERNEL_COMPARE = "KernelCompare" # sheet name OPERATOR_SHEET = "OperatorCompare" MEMORY_SHEET = "MemoryCompare" OPERATOR_TOP_SHEET = "OperatorCompareStatistic" MEMORY_TOP_SHEET = "MemoryCompareStatistic" COMMUNICATION_SHEET = "CommunicationCompare" + API_SHEET = "ApiCompare" + KERNEL_SHEET = "KernelCompare" # table name OPERATOR_TABLE = "OperatorCompare" @@ -55,6 +59,9 @@ class Constant(object): PERFORMANCE_TABLE = "Model Profiling Time Distribution" MODULE_TABLE = "ModuleCompare" MODULE_TOP_TABLE = "ModuleCompareStatistic" + OVERALL_METRICS_TABLE = "OverallMetrics" + API_TABLE = "ApiCompare" + KERNEL_TABLE = "KernelCompare" # memory SIZE = "Size(KB)" @@ -78,3 +85,9 @@ class Constant(object): OVERALL_COMPARE = "overall" BWD_LIST = ["bwd", "backward", "back"] + + CPU_OP_FA_MASK = ("flash_attention", "fusion_attention", "flashattn", "xformers_flash", "efficient_attention") + CPU_OP_CONV = "aten::conv" + CPU_OP_MATMUL_MASK = ("aten::addmm", "aten::bmm", "aten::mm", "aten::matmul") + KERNEL_CUBE_MASK = ("gemm", "conv", "cutlass", "wgrad") + KERNEL_TRANS_MASK = ("cast", "transdata", "transpose") diff --git a/profiler/compare_tools/compare_backend/utils/excel_config.py 
b/profiler/compare_tools/compare_backend/utils/excel_config.py index 306abcdfec6..b6be0ae2ebc 100644 --- a/profiler/compare_tools/compare_backend/utils/excel_config.py +++ b/profiler/compare_tools/compare_backend/utils/excel_config.py @@ -18,6 +18,8 @@ class CellFormatType: 'valign': 'vcenter', 'bold': True, 'border': True} # 绿色背景,加粗 YELLOW_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.YELLOW_COLOR, 'align': 'left', 'valign': 'vcenter', 'bold': True, 'border': True} # 黄色背景,加粗 + BLUE_NORMAL = {'fg_color': Constant.BLUE_COLOR} # 蓝色背景,主要用于行样式 + LIGHT_BLUE_NORMAL = {'fg_color': Constant.LIGHT_BLUE_COLOR} # 淡蓝色背景,主要用于行样式 class ExcelConfig(object): @@ -55,7 +57,7 @@ class ExcelConfig(object): DEVICE_SELF_TIME = "Device Self Time(ms)" DEVICE_TOTAL_TIME = "Device Total Time(ms)" DIFF_SELF_TIME = "Device Self Time Diff(ms)" - DIFF_TOTAL_RATIO = "Total Diff Ratio" + DIFF_TOTAL_RATIO = "Diff Total Ratio" DIFF_TOTAL_TIME = "Device Total Time Diff(ms)" DEVICE_SELF_TIME_US = "Device Self Time(us)" DEVICE_TOTAL_TIME_US = "Device Total Time(us)" @@ -65,6 +67,18 @@ class ExcelConfig(object): MODULE_LEVEL = "Module Level" BASE_CALL_STACK = "Base Call Stack" COMPARISON_CALL_STACK = "Comparison Call Stack" + INDEX = "Index" + DURATION = "Duration(ms)" + DURATION_RATIO = "Duration Ratio" + DIFF_DUR_MS = "Diff Duration(ms)" + API_NAME = "api name" + TOTAL_DURATION_MS = "Total Duration(ms)" + AVG_DURATION_MS = "Avg Duration(ms)" + SELF_TIME_MS = "Self Time(ms)" + DIFF_SELF_RATIO = "Diff Self Ratio" + DIFF_AVG_RATIO = "Diff Avg Ratio" + DIFF_CALLS_RATIO = "Diff Calls Ratio" + KERNEL = "Kernel" HEADERS = { Constant.OPERATOR_TABLE: [ @@ -176,10 +190,115 @@ class ExcelConfig(object): {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15}, {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}, {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30} + ], + Constant.OVERALL_METRICS_TABLE: [ + {"name": INDEX, 
"type": CellFormatType.DEFAULT, "width": 40}, + {"name": DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DURATION_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DURATION_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DIFF_DUR_MS, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 10}, + ], + Constant.API_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": API_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": TOTAL_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": SELF_TIME_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": TOTAL_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": SELF_TIME_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION_MS, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_SELF_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_AVG_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_CALLS_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + ], + Constant.KERNEL_COMPARE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": KERNEL, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": TOTAL_DURATION, "type": 
CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT,"width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, + {"name": DIFF_AVG_RATIO, "type": CellFormatType.DEFAULT_FLOAT,"width": 20}, ] } OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"], Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"], Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"], - Constant.MODULE_TABLE: ["E1:H1", "I1:L1"]} + Constant.MODULE_TABLE: ["E1:H1", "I1:L1"], + Constant.OVERALL_METRICS_TABLE: ["B1:D1", "E1:G1"], + Constant.API_TABLE: ["C1:F1", "G1:J1"], + Constant.KERNEL_TABLE: ["D1:H1", "I1:M1"]} + + # overall metrics index + # computing time + COMPUTING = "Computing Time" + + FA = "\tFlash Attention" + FA_FWD_CUBE = "\t\tFlash Attention (Forward) (Cube)" + FA_FWD_VECTOR = "\t\tFlash Attention (Forward) (Vector)" + FA_BWD_CUBE = "\t\tFlash Attention (Backward) (Cube)" + FA_BWD_VECTOR = "\t\tFlash Attention (Backward) (Vector)" + + CONV = "\tConv" + CONV_FWD_CUBE = "\t\tConv (Forward) (Cube)" + CONV_FWD_VECTOR = "\t\tConv (Forward) (Vector)" + CONV_BWD_CUBE = "\t\tConv (Backward) (Cube)" + CONV_BWD_VECTOR = "\t\tConv (Backward) (Vector)" + + MM = "\tMatmul" + MM_CUBE = "\t\tMatmul (Cube)" + 
MM_VECTOR = "\t\tMatmul (Vector)" + + PA = "\tPage Attention" + + VECTOR = "\tVector" + VECTOR_TRANS = "\t\tVector (Trans)" + VECTOR_NO_TRANS = "\t\tVector (No Trans)" + + CUBE = "\tCube" + SDMA_TM = "\tSDMA (Tensor Move)" + OTHER = "\tOther" + + # communication time + COMMUNICATION_TIME = "Uncovered Communication Time" + WAIT = "\tWait" + TRANSMIT = "\tTransmit" + + # free time + FREE_TIME = "Free Time" + SDMA = "\tSDMA" + FREE = "\tFree" + + # e2e time + E2E_TIME = "E2E Time" + + ROW_STYLE_MAP = { + COMPUTING: CellFormatType.BLUE_NORMAL, + COMMUNICATION_TIME: CellFormatType.BLUE_NORMAL, + FREE_TIME: CellFormatType.BLUE_NORMAL, + E2E_TIME: CellFormatType.BLUE_NORMAL, + FA: CellFormatType.LIGHT_BLUE_NORMAL, + CONV: CellFormatType.LIGHT_BLUE_NORMAL, + MM: CellFormatType.LIGHT_BLUE_NORMAL, + PA: CellFormatType.LIGHT_BLUE_NORMAL, + VECTOR: CellFormatType.LIGHT_BLUE_NORMAL, + CUBE: CellFormatType.LIGHT_BLUE_NORMAL, + SDMA_TM: CellFormatType.LIGHT_BLUE_NORMAL, + OTHER: CellFormatType.LIGHT_BLUE_NORMAL + } diff --git a/profiler/compare_tools/compare_backend/utils/torch_op_node.py b/profiler/compare_tools/compare_backend/utils/torch_op_node.py index 690c46cd51c..69ee92d1232 100644 --- a/profiler/compare_tools/compare_backend/utils/torch_op_node.py +++ b/profiler/compare_tools/compare_backend/utils/torch_op_node.py @@ -64,6 +64,14 @@ class TorchOpNode: def device_dur(self): return sum([kernel.device_dur for kernel in self._kernel_list]) + @property + def api_dur(self): + return self._event.dur + + @property + def api_self_time(self): + return self.api_dur - sum(child.api_dur for child in self._child_nodes) + def add_child_node(self, child_node): self._child_nodes.append(child_node) diff --git a/profiler/compare_tools/compare_backend/utils/tree_builder.py b/profiler/compare_tools/compare_backend/utils/tree_builder.py index 34c1fe1a1f4..d5aa787ac2c 100644 --- a/profiler/compare_tools/compare_backend/utils/tree_builder.py +++ 
b/profiler/compare_tools/compare_backend/utils/tree_builder.py @@ -23,7 +23,8 @@ class TreeBuilder: tree_node = TorchOpNode(event, last_node) last_node.add_child_node(tree_node) last_node = tree_node - tree_node.set_kernel_list(kernel_dict.get(event.start_time, [])) + if kernel_dict: + tree_node.set_kernel_list(kernel_dict.get(event.start_time, [])) else: event.set_name(last_node.name) last_node.set_memory_allocated(event) diff --git a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py index 7a33168da37..58bad621b03 100644 --- a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py +++ b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py @@ -12,7 +12,7 @@ class WorkSheetCreator: self._work_sheet = None self._row_id = 1 self._field_format = {} - self._diff_ratio_index = None + self._diff_ratio_index = [] self._col_ids = "ABCDEFGHIJKLMNOPQRSTUVW" def create_sheet(self): @@ -20,7 +20,10 @@ class WorkSheetCreator: return self._work_sheet = self._work_book.add_worksheet(self._sheet_name) self._write_headers() - self._write_data() + if "row_style" in self._data: + self._write_data_with_row_style() + else: + self._write_data() def _write_headers(self): base_header_format = self._work_book.add_format(CellFormatType.GREEN_BOLD) @@ -43,17 +46,39 @@ class WorkSheetCreator: col_id = self._col_ids[index] self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width")) self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format) - self._field_format[index] = self._work_book.add_format(header.get("type")) - if header.get("name") in (ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO): - self._diff_ratio_index = index + self._field_format[index] = header.get("type") + ratio_white_list = [ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO, + ExcelConfig.DIFF_AVG_RATIO, ExcelConfig.DIFF_CALLS_RATIO, ExcelConfig.DIFF_SELF_RATIO] + if 
header.get("name") in ratio_white_list: + self._diff_ratio_index.append(index) self._row_id += 1 def _write_data(self): red_ratio_format = self._work_book.add_format(CellFormatType.RED_RATIO) for data in self._data.get("rows"): for index, cell_data in enumerate(data): - cell_format = self._field_format.get(index) - if index == self._diff_ratio_index and cell_data and cell_data > 1: + cell_format = self._work_book.add_format(self._field_format.get(index)) + if index in self._diff_ratio_index and cell_data and cell_data > 1: + cell_format = red_ratio_format + cell_data = "INF" if cell_data == float('inf') else cell_data + self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) + self._row_id += 1 + + def _write_data_with_row_style(self): + """ + 带行样式及缩进的sheet + """ + red_ratio_format = self._work_book.add_format(CellFormatType.RED_RATIO) + rows = self._data.get("rows") + row_style = self._data.get("row_style") # 行样式 + + for data, row_style in zip(rows, row_style): + for index, cell_data in enumerate(data): + cell_style = {**self._field_format.get(index), **row_style} + if index == 0: # 0 for Index field + cell_style["indent"] = cell_data.count("\t") + cell_format = self._work_book.add_format(cell_style) + if index in self._diff_ratio_index and cell_data and cell_data > 1: cell_format = red_ratio_format cell_data = "INF" if cell_data == float('inf') else cell_data self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) diff --git a/profiler/compare_tools/img/OverallMetrics.png b/profiler/compare_tools/img/OverallMetrics.png new file mode 100644 index 0000000000000000000000000000000000000000..b130d3607344c983a9304440e38a45fe96a4bb56 GIT binary patch literal 66941 zcmdqIXH=72w>GK|3i^maK`9z~Rhk6pT~NT#M5P8I9So6<5C}-ofb`x$q$tv)geFxW z^j-p?_YyjU651D@x9s=X-`?Yl^W*$EW3aeqa>KgUTC=QcUTfZgS{lkX|Gf9-rAwD? 
zs;VeyU%GUq;?kwd``5@x-#GP%tdlO6owSvoTq@`Wu8=-lu~JZ1xOAx~?8b@7Rnq6{ z_A2^Lmo5PsFWTi6yYJ?gE}dtoDk;2jGhVBa30+X>=e#GCbCt5!^fDuiCttMp32U@w zzKmza<6AuNaJL6FH=S>!*D?l2TXPchn|QWQ#3xjooOb0fr>vVYC~*hXbD{T%NeNN3 zV#3B}J?BSqY18Ms=O=R8Q5>o})aGieR;o!CZ6!*Q?eF5sz(8Ne!RC})9rAepsO}7V z&ZZj4@$a9u0f$p^I)Q7xECL_n;^H)!q}{fr9hTUeW^YB1{-{mLF|hfHP$;vUSEle+ za!qA5``cF>?mlX0_M@O;q7aXtCiPY=l`zd_*>FXd`LDaej~ZMe=h$z-2(bk)iq*ai ztEyN<;@nH-LZ4xBB}_h50yxO}rIZ#op#l|K@P)axgy=@4RQbO3uwT>pPko)bs%|Fj zZ}^%cmHXaFv624a|qXoBi4P?G&)N)okBgNVW?W8jjnvmfID#}7M#9BR+Ap<>t*aGog0?+ zKo)Uu7K?!%bEZ5qGCW zGOfV56GNE2nd#XTAu_HC;v3%}mVl(o_m2?%s8Ba}AS%v~F^)-2#x37*D-iX}-M(}w zPF6Z)Hl1o=zuqsslU0TTa2vkx6YJ=9z~iLoA%O}tw?%$YJGkrqr6x^>vHQj1&?uQ1 z;y_!9g{q15fThfAN$`NzqF=|hi3w2)^0wh6eakLcj~+&8VPf_{P$NO7CUHlB(>>{R zY56|CR_uDs(Y2?t0Ei*7-N!K}N)Z=8nViwd03@mr1Z?mO(9IK zaPaQdM%Hh(hP#+>bFk34bDcJMO*)+a_8r2Xqi@elHJ|J$AlGQ-F`^Y3ONpR@(zqi_ zf=%c)Q~wc8v+HHIFqcfg$y#ml(KU4Fum@lXB-a-dHyD)^(&!r|)Sau5ZdN2}j`*+O?W$!1S2 zORCbo_4yBgJ!yx07Y=2_BO^K39i)m>=ZkKk&}9hocM69Qk0%}x4_rWU9)(pJ2T=ri zR!odU=q)(m1b!Zy7g3Z4$L%}9$J$sA3a|=$*G^R!0u-Ge-Z|P2uPsBunTX*MffGZR zaK#>703?GMFbHN`yn;}+OEzRCGCd2s(8Z%zueL}!Mb&F6e`FKBo~xA#Ht+oYr7K6| zl*t9xYI*e4(l=IPIi5ZY61~;F29}PzRh81qb>*u@bf|x1ll&8V^+)b%#PI#zb(~o6 z94A9@WrD}&^`|ZA8inaKx<+vXFka|MFs~EP)H-*GxkoW!StZRdG}r<@Q05KBzP_(P z8u{gI*6s((%G@^T%6&On^M)Qzq&la{n0K-sa~DZfa!X;Ji$J&0opCSHRzZfKwb0$G zPXrV?KU^=6C53})vzGPGYMEvl*o@~`RRQktP22#&rPrlsmwpNg^NvOuS%-DT?=5~o zKv#`^<}{X7Sh0(g))aUhf@uV1*1Zq=9)$oJe3D%|aab+iv>%T7H#NdBj&H%z+f%Q?ClN$ zr5#xxayJ9CFA)uEY)M(P=6FG#Djc!Z0bdy0V+TiovR=MoeEOkFFw2;AdY&)48@>> z2Sxik<+RJuOT1Ru*~wCmc5%Ij;v)Hi@X8nj=xvd^Tr}a--wg69_xUf@AKg zDzO82LYHvsUGJaKN3HZ3f$ZCi$NXsD=ie%ZVI{=kM5qba2luVm+^FA>z~l=Nn^dXm zHC}ZH)%1{z$nUM z>DyR`rxI$CXzi{OTN6uW!9tw1CPX>0Vtz~eS842-ECtFE4K{A1A~{m zBoYl`1>!0LJ@?QD*@0(l(8WI(;;CdU-honqlhQC$svJPdN}WNsVBwaL$K2|%VhvE6 zt4Z2^`sIf5BG+jqKOlnF1w_9Bxd)D%xQH-AZQ!{>^(|cOtFqwZ8YvTZ;Iwam~b}y@#pzocyh?N_ZWQGvP;IsGV&>hODh&tAE}w>zp6kOHA46)7tFPqJ`pAL@I(0JQx@%pNuCn8}aB{ 
zUkGX$|UU(nd9c1hjuoG;Rmv_XIaNaOl%LMuOA~jViUk72%Ow>V7sDi1 zU@hUtB|Fu}jaG>i#+Yil#~o{&H!WtIkVjz0^&6!eH>&BX(fP4{c=>9j!YQwCuXG5v zBpX3oRU?`amv=s;^~Py~C_RYYd(#4i=0!mar+~UR)N!!j0MYpEQb;AP2!Us?nZJHR zh8r^vUN*zNJWx(a;a&92^chF(L628FD6yaeaq@X7m;GAvcc6k!C=Tqc&JSKJExVD1 z_G0%x4fN~;Q;ThWh~JIgCgXg-VMccv#)OWta(2bQK#|U-SEbo7%u;HFs9whdyqf?&UrwH-X8h*0DKA>JP ze835ENm>7fY}g+lvIa0bY8axRfm~4-r*sgGW4ArozDIS%vGr3JD5r&tvVFni>VkN` zU(@qJkA){Pp+I9PlIuJb07URR)u)e519zKQ4}5!Yl|w$`>?Cu5lmmDRb+{-njnaEQ zM;#UYx*Mth4`y-mjb$^u`G$5O6|#re>qO)Jr8G_wx$How%%@>sS29EfXz=BG5l*tG z3Sg%Sty`3xo`u>kf~}E9 zU&cdOcf(~TWv1&T+O^$1+y3M;Zl|mvirkJ|G=<{kv!w8_RggrwY^@sV_^GL)@jZ)9 zwrx}Mwi^YHxT>DGJ8jsgZ9aXCvqVx(9IIfv92TU zNUu31$P*YU{%-ZxWYnX*wzKP&JRQTsgKgtj1ZQrU?CAJ!t7K&s+e=HXll|`)b<>h0 z8Jq&r;i8|Gn|fEplDt=G3VBa;8H{E2(iZIqxtpB1;Dzm%h0w?}q#rM#<(`teWlaeI zyfiBLDH!Q!tplxWw*l+bIgsv-pDRe7>YtH`3+-jCIF+BjLoE zsyOtk?%=4`^y)!&8pQ8EQ8eqjMMr;%4{jk8e;~h5Qc?O{sBSm=p#1m3@ASay zhFQ&q0ETuJV;=Pqi@u6X##{PQ<);Lh(8)U4JZYm(+iO0qRr~;(G^R(`<04comBc(Q zH|jW|=bnYM-8*rmvgN1D2#6Fa!pk^?gcoR>VqA)lgp;S=Ky(sHbi|)W4eXE<}Gz2%ZMZEEX`yd1cM979;T?I`%OAF$AfTbX&LwKA94_Ft( zS(F1D8R*s>LKHV{z-?33x#u#*_oHB*v%N?|NBe2WMnf54@kh(0eCY{gC^xe0M!eSw zq~lIdT-jD-?c?m){Va8J3nbi-vn4bgdd~q)qk@*0TktxS`|+EkdQu!16%q;Bz6nP< zL2Q^8-9xVE;$?O2xmR_r-f5!OH++O{K`Nzmoa(I}t5{T=&oaH7K*+6>v_DRj5_{BU zjIGacT6)ymlm9XppUbd#?_4!)fTZdEoMBX{!j!GL`dGzAuWDk(BV0gm&SKrFev0yE z#`_ZbDNKT-C{Gi-yGP_U?N*6ju`^ydQt<_^KD9ESas=pCK^Ey4h)?g|B~REkm_Y46 zZEY}EuKc3D_;RJzyvv}LXRf6zJ8hWiKXGC}QYFx2E)46&60Ruf8J$J(QI89*kc@W~ zuP!SCEbPlmFloF+<2RK_qBMfZ!TuccVHOG0u-1<}jmOnjcn zSq9J-D4Utjg^BP$6C2aZi-fs~#~3q3fd$Yb#C>LbzfmHP|?=T8DfZkTPL4Vkpn{a;8u&B1?DWP zNa$EPKWHdsnd8^NperL)llY2p{u$fr6jF43ucl)vYt6g!+bZ~aI%JKP9e0&sRJz7J z6I3ZTA-nou#bijoN?BiQ!g#Dwpaq%d3Mcv@@`GTVR7=@uOt4GGbb3sUZxDn#ZJ8z^_#45Jiw*pC? 
zPRH0fzQDAgLZ$p7yI=scH?#7prw@;UAzN!qq!|YG8!l`hp?o+*_fz9!iEJi>=xbd_ zWn_gVTXK{u17O*15GtrDxq;sm?*n<_oFsZRdaf436IvrBb9obfk#Il)s#86#L$3;u zh5+|-*ZO*md_n8*+_K2t{ItAb!nf9^HKaiAjz$34Ue2pZ z04n!aS!fot!(hO#R1Dw(8Kdd%Z4In&1z6XvC{D3t@u&p49c~9s`ftJ4e(jQkJ3Ju(Hh^G>M|Y4ZC{63WTwaO zcn9P3uBuqaRK1aB;fzb^?K%_Lq_yf(b=}NfmkfJco(I7^ciA~lT9Gh6NIy>3#OqZS zCoUb||F#iGV*u`l4T`j(d*Sb66q$zQgJD$ey<~l~s7}rDIFw{>_iJ;1RLxp=ksb1l zm!qH~v!kXXGikSvz9D1ePO+hngE8uV)hLf~EV?{KxA0YWB}f9sICsy*QH7ZwI)%a- z&(DQ>At)cvNc1ZMTAM@yBt29-_OOz4M%SL!M#J>k6r+?+uMqZ{d{uFBRIkPeX|i(*RHI;8kcYOkTjTYD!{7< z2U9qegu#+3UH|(OtB2U)VfTlVxa`M5vVU_UDjI1F72zieh2hJozhLD&!Va}v%(CMQ zH=AQl?zxy=^7yC@h~RKn6DW#j!Tc9Z*MzeB2irHrUi&J7Lf zdqmPGj#B6bPK|l+HtF&oq8f8*=wF1v!#7%7mI_I%q_*NG$@X_~rE%c)1x$oorEG8F zvlhN+NG;|6CL-M=O)ogO+z$!~@mJxWNG+RcHQ?`JHr2oROCFk%FI~4tZBK#m_TNQP zKnb{@jtc%Dw=ktGWgxW#+3)Os7ZYTq8HFyA+t>A9)H;1d6-jM?!bR!t;sEL!`(NqK zpY+jz`n{F`q(=M_dU5#=k+|jDbcIggmhNRg(BD_+|3`~e!sY=XLZQY_M+diRlnboIPcYLMkMJijNZfX zFB&=w52S3Dwxq&QhK>4(4#>?mGOLD0i7IwSiVkoe@G=chz_Y!ZMxsc3AgO!E(i^SC zi+Y^2 z*2^{NYmu((ut!y12eQ8**toneDBVsY;blB~bRk@zHBCX+PC;{9a%G+06X#mu`71m* z#^xQf@dOQZm0lrplOpt`3(74&C(WV1Cf@_u59Pp(z$6fkf~3G0ACzn+ov^F>kI}l8DfU#Bm+72Cix{+q z(eK|>!$#_tZ(~YGEH){8U?rUS!l!fmIDL1}vsdTYn2dJ9LuUF>Lmk4*26bU?gh`w= z+i{jx>vu+M^4KU9oQlY*o(gTUQ{IzCv#)CYa@V)fMYatWCSP`~(T zRQOJC;ptKTZZo83=c^}NHnA(ud1HaSc{ZY{5ykyipKmk#&Ac`f4Mbi`WKBvKl1*bZzQ(JrhriQ;51ul>Q_ zEjO6#JY`w{ij(&!>`_}bsqwm$q(AyA&K|j4DRbRn3h&at`y*E?ma`&q1CkUWiH?1p zrUTCsFdV}qy17>F|Dv9gp1(HN3|oF0>MXbYGOa`A+B#bH*}`*V|0!Y`(t(mLje%(q zS6#)ibJOxr8$V45Sfc$=*&zPtIU^t=q8JR@9(_RXy5)u~{BYP-(I`aC%j2?au6S8I?s5Jd*$Vh% zX$&2TM5 zd5=+gg+Q^Ynot>0ffe^riB*Y_LyILCo*>Rf=dZ?K2pBs^pjuV{{9Z zzUH%E{*!@p7NXwSP;kWgmka0ukmtSZ-(5JWXik?2#V!Ut;s%a;6;+G?UW%nuBjep` zW(9)!CdT_$nIX{e0na4CkJfPo9}iudiRyv!?nQ;1S3Y0)kQ$eeaqxs!^{N!qFfAl% z9<#5osL5bxZ6PO=jw0alP+N~hGZJuZ@ew9~P0KDp$#Ra}zQAK>;m~P|xfzRd)hf?Q zw~^q>Z-ijH8n!I|Ub!9(`!~xtEC&V}_NdS;#C?^$t7Agy3s_eY}=b6}fPOTwZV zfv!55^at9a_HFE~r4_cm-~vKrR_%ib&gU}?el2srEx1JWm+TIXMU|RRVR8F{Isd(w 
zOML5`v?O$xstSp870=91$w>9v^P>d1)*?v1fR(hX9!15eM<1Bxf8tF#@+S8XGxe3j zC3`h7UCWJSjg{}-hg8aR_#DS5;`R49sLZ}cf$VB*Q-*~B`7lEi88)1Io$skYw_S~_ zNhj&(p|j3yd)Q)SGP5=*PX(E7#c@6J)cJ{gxro!CQ@Gboq(Sxc4iTOG6&7n|e=d+;akB!!*Xv5&4AVJ&vix&yta(=c^agktSGlp+pCGCCExN;@g23^_n56mo-5qo;2r{s=0OH))$NbhG8wbU3PqF7S_7$jH_PJ-i+-D6@ToSo ztZXurono~}!L4|n_)m+jqO78>amRloTRJFZK#sVwK{3BDJM*?b1yrkVe`epyCP zSb1~dJu+Qn7fmppWsOB4jy{FPppE}!#R5T9&k<62UDk> z_yBx&nauy18@Uk$TQB zmVsH=bpb=vu0cwTs*t43(u3rSZBmAVcKYY`r50K+&@ zn3i5x8W-1Y@I)S?V6^Q`n?QQ>K}_6??Zywg^^b=M7cp69$>&=!gZcmpdW75~d}!G<|B589a=(?<3gW^91r@F)U z_bnc`s~{pIJDX2$SlNW`n7)m=YaD5^Y(bArhfLL-fjx#YTWP2fdW8dsfNnNTOJQF1 zU#*Wnb#Bi(c~H;RVqwZXk`V#;A6*Og$v7V&FRi{sU1E}(JN%@O0lJJ?hn#<7K4Sd3RC?$ZzCN_7gT5z~aw*JL)fW=NluHFn8l!Kob5u#f5eDsRp?IL++^$pe4h!pu}Cz+uaY1Ut<)`du(bX~5MJR_7; zQNgCVBdOH8-L6{v?4P3>yiFS9KL$z?JKhM-P>jAAj3Ax-a;O@ku1Ok8s1B?Ak9A25 zwp=GH_k$%t*4!0mONMKuJKoiC@QvnkvCaj~&Yo271i41!50A0~@4h0H9iQuIZSAFY zHRRBK7p9lT=Q-_P&jW)$Gw*xu)%_mHrDW$0a^KrcqCv#%nx;@UGd;7tY@2`Zyk*!6 zyk3Iru~D9k(_=@bj6Q|uOX5e0^KT#3&^yCSQ}su!`QDYnh(jMvz(->4j88U+#9Tb% zQ4yhhlZL;}luwqT=ozsJovnCW{FUxy66ju;tEQ=}apfH`FB5L9X93ooQ*I$ML$ol~Lo+Yi_bN7h&_x1IgKb`D+COa+!-T3Y^7#C| zNlv%Ok6~$3=JXA%eRdMwbvH26m@lI1{Y3ZxSr8;+vR#kn)y=x0-QLnOz){^tA(#6* zRiz{}G)ig9ninR$0ai749Z-s4C?++-sm9?g`Wvjkk6me+p?eVRE(&lPWQKQs6JC=Y=0q$RdHQ`vCa=s zYv)V0HpW}~IXx)sI*$xCqklyW!937O{BnDY-`A-{=~wT0U6T!2(Sv-A#MW-NLk3aW{aIJS6cp2b@tNvyi3ApaJD`SOIwY;r)SqQe5-N6hu-65 z@ElE<;$$e!a>?C@e{ckRV#HS?e%*%~96WoVB9*|}GnJ>!ZPg;S%qH(QTEBWj05+O-cp&PS6f z=^{0W23QZrYV*7Eh`>VQlvd?PFf}B9Kh7LDR}n`%)n`T5 zXX5il)iR5saENl^EBKAl9hxMeWx$~1y^!WE*PC_sVhStA>e_N|!^(}JZ)&n840N^` z4Zj@=(K09Lc^67+7rt*yee2I?E`(+^Yaf26);YjiPO5x)bl*GaWcFV8fr(zyR%4dB z{fxo+uKaO6f0qGcJKAWOLoZhYgYldP!ZXhjgXic6X&@6_WZfoyA>yIZO3zbjT$62G z_=bj^+$xzQ1Pj)a*Q+V>^S;{d%feG4)FZ=`4$;ZVbuYX^@oSfrZc`Ns-KO;JFKTG| zv$VtWVc+gpp2mTL>4(mtD0SmFoTUzKB&VZD!$-t$HUIt4@E;2ep!Qg|d`vV{C;WhO z_&0By$Pj+eU^AGkY@v;=%(PdAl)-H0(*+r>n;JEfe53Hh*E9{Fani5jiUpIYq_(L_7AEQ6B0NnQm 
zT$vieqX;zwe1#k|R6@#EAB9<$_{5W8V(kLc^F${(>0QjHxhqQ9hv-ujCCnCq!!Op* z08x$U221j9WoFCx2&Y1UUxlHHhWbxO*O2dm>V67|s-~v=bdV7AEZnJ&K>NvEWhia{ z*BthEUu8%=M|h2WZA$uD08-vT42rVzG3k)a-OEJ>PGJ}!PBnF zqI33$nimD0zlp5@P#cEe=9Ua}Lb`}L1Y7buBK`-w*aCK7-c|TnB9!>rRNea@;T})t zRZ)6ffFr{8cZV@H^0ml@s(qWZbIn;9HjBZknyxVYXBovJL$43=nMm%5cvaN3ymidm4! z?K~Q{YiKg_o}L}ezuM_GPq2Y!2jm1gguHeSUg%CNAC}5Mi##1kXj&&ElQjD_DbV$W zeoJr&;oa!)_Y)n;DvATltii8T=*YGECM~e^Ci>IhKDcLHN;$_P+?>jwz^bQq*CKbDgJqqH2KN!{v$h|7oqRy z0_r);PUw^ri^E2P& zV#X$wgsMK3e!kXxiSebu_9s$+d$OsY5R(3R^ehwCnRl_%%eS_#sg3@WmX0EM`IGJv zv<@HBRu{rBDTlNYp1hlrO~8`kK0SyHh3M(qXSM4|dWK{>qW+TvLi4Z`dceV@sgNb8 z=y0UTpw-<@9k%}}D8=~byZUhI-A#qg7T))pzh-r(1J3pfF1z!6B3!>%2)Y2J$h=&R zjIGLi!ZTGj8;P=fjpyi9|8UMZUl6mZvyor8U7v>igunf?l!Oj{cqGS>3)`~W0+V+J zbEJ>3sAA-FmhEDM*)2062{6XgF*djv*KL6j7XwsuaB4|?pTYWc4h3fa+1*`7fvyb$?O8$TC?HNVMILhB-d0t<&<(7(TY{O3c35YLnL+ z;T52uE){@@ty6Mj>|ej5VLP%kvOD$ww}G^8eVs_#?osDQD8%O2^_`tigm6;LUPt)~oSCEG;i(SW8=kczPrWM90V6Rp8lz)Xs?pVj>r`GH;Fz`B{i)KD z!bvY+5A$mI+=WatH~c%D2X_MV)SH(KTq5%6)z2kw;E&7A1b=74MTS$04q3&Sf(&8D zBly(g*StE3RkoUM39v^|MS036`&g*Eggf7F%im$r2^j(YksK+HYm{O%tVaL%jgRP$*XGJGw)P9+dhH)%mifEo>%ZAI^SxeUqiwdbMrKxN* z1GMuVQ;w6VrvBww%V%6|`5k6Y^6qa1Y1c|Wo+;TkeKwht)kT?i-_**IrrnYp&mmC0 z7AehT=#cG}tpyBIShC8_h}CSu$M%FA&+T_e^4~X8J$izfH3O9}G-umAhUXUT2RBnS zBzExuOFvSC-L6O@RNdt@tQ`z4a?wHlxijeT(4EgTq%rIvX*Q>>cf}K@iwD;YK$#k~ z4GUc}1|;M*zIi@P%?*>J1sbe6+dnP^Jp|2+>;KV~q&SQX%~>3{SMorU(6?i_qq5`g zpGLRPS8$sQ2rU?H(9|Ap74<~jWk}i-Gwy8tHGu2vt-4!B+3Vm@#JEDoycG?fws>2@ z8`zUXVg;_c>s2{$ITj_D-hAKu#Yq_ZVvO5BR%|{@6$N7~XqGce8RnmtjxAvRSPNYl zSuHX#O!>?4P-YGtv`o^^%q2e`7zD=TFD2^8h2! 
z|Ct1Ngzj4}sU2IadO;E29?!lMA^Kf+K4Sj*b?@zVx*uItujx+Z(T&-1^K{QJRY!sj z^++N1&F)UiXIOq1#l5l5qNPon(6f{G1QUJZ$?~|v*pNu&bY`oct~GJ9!$jF(9382W zPN`!_U?J?2+oYMaER#{)y`#9Re%rKp`7&nP5|wha<#YAy%>io4`FfS*@A!B1Z0L}& zn@P%?q#(%?(DZ`e#$Q(QHm8!u;0#S*j*f3fR2bgC7fb`s%y0dUuDL>jVSq!;JFi(_ zgn|3{@Fjv`!e~79OU{@TQYfwcFkc9@kn?6BQaO^_yevWri1=dTCG|}`qUeOy{rZQ* z8VZ4({idqcqr<9%3S4jsI=(i>&NavD@{3aLuqPu*I<(k2Cd0=l)9{z?MQJ75D@MIK zmaOCQxBCZsGCT57)Rns3Jx z{@!Z)Lv=Cw9XO?F=#Z`UOpd0Kao*+ri-Kdpr8%mQ`P2AOwP3otQlt0%hKqMW2f0FN z^xKsNz%JvjKe@h+J?RN=TuNpy4sO=VP1SsC3$rIh%bt$YBE8YyeiyNFNRMIj*FI!m z^_zHAVKdqkD0^XvE$upvH}W_mdzIU_yB_lDjzor?MM~$k^-ket#VTGIInO>GrQ3}| zO!vZ4VyzbF=7#>r6hEd`p-mt{aINtVA3bh)1^QfR?T$X{7Ew(N{^@{BIVu}9H*|4` z+FuP0du)*h@f<)dvu$eRrRW;x8S)n3C}iMrlZq_ncd2~99_|$!VPLqhJLL$EYl#$j zN^-@wyu2lvhPTuRZeh9_*>3MXXe&OM~?I@c;F~(qLjp%w0E+ae7zuTGp%st(_`qAcXm~IPuQR{UADQWXKhWj zm+A7|s`IQRr|a$juiCnYr>~tocur0{y^%+rer%7Hum@V&EeLPlu<{cosZ$Zz$^Hsq z`uMqw%Ltd5QTsBc;o2E9cKqPm@{taIRZ0K=2fEAblKz$m(!{P@~`>PuzmAuQ)H}~ z1%1zOu+g#!kpLlnjpXSc;&0#YGS&wN-;$o3oZOiY-0;R9tzDk)P;5rGRMGEHFkd-( zORX^lt#0}hK4s=vanEyPGG_Wu>+bE#Ggp)0C9jEMqI6iR*Y=g&hAGc}8I@ValR@(U zfhI{p^!utc`ws=aoUbNX3v&dE|KXUVAIYr`)I(03fF5E}mQ-BBj$eblmqD{um8nh) z8QaMfVgl4bF4ZA%caKf@yskwTRnSu0E8Cj>>4olq8uWq^dxo3zZ^1D$)Iq!RG~KW^ zjuh)ij@SaaGV^O$b?;E-_uKoJm$Z%VFI});(q6v2#)4wZbc*i{#9+Pq3bQ}@?XEHn z->O$+f%^6De5h!<``OdT4nql=3=&d5o}IK>h-vk;W}W&fV%)2LqTU1(F9!_;Dc$~% zB4N2>4is_yF}88t<*a&F{g3tPxO;?YxXYV@jOy|(SJ6@Vr*gLm60&Sx$Zjt4pM6?w zT*k5ul=meVCEd+;?a4Y2HcRVL0cym(t_P?ApLYjQ)GcJaeMReV$6S=G4tiiJpI>QU zr%)>q;A2@qRn)vSbIhIJ%_m_c3Gdtd7gH zt2N6Die#TLX$7wj;_+7(&we1Zeo8*V(zh5TU6N_BMbex|$Z}PGpfMr9v<{z^8DlK3 z+w9jeIHxP0F$0X9iI3Lik8n@ePu>vy~@216q@UJNrPNv3JgwCE>jsCpMhgM>ox3 z)gQkt|HuNs337}|DRG`}+b(h1Tt(~%F-s_W=hSh{iIa!zdfNvMl!;E!KgtxKs%6Lv z{;YN-h;L@Lixm%Rd9`a3G8+Aj*o5b(WGAHhZWy!Y8Jk{Ia%`^mXwJrh>>MmOXhX*)T(!d+<_(C>M(J7j(Yl}e^oqv zNTAIKONrGtoaKoxV_+G!N!l_EwiDZ$pr89H6Hqtn^uU`IDkDttjbRN>8QETsZ8lu% 
zZr)}ERAf(V-paQNRa_x`uP5-A*|_B$QzMJQ^${ww?V}esHBNw9b7aet>}3Uo*zQ+S)P8sWOY)73;Bym>(!au555)3IAcPuOL4wQQZ_@<#83ZeKMbPS`@Z(j zQ1JD)zPDlz{VJsP$J7A#?|&AbmiKhpeSvFZ&>B%Ijp012t@RccysD==DAV&>^LXdVW#R4G-a*3!r{4Lz7#h5x6 zHg!qN-F#E9efy8=9SSlF5?Z@02+|Q!JU|?`*S-fwxmSzqG{6$(m}m?geY+(Fp z-GuEc?p?!{+oe^%KGI8!l-S(Kafxh#!~9({nEqw2Y0(3^pPd=eHKU5PZ6o_d`IP31 z`a_z&axE?XboOj(I9fzM5lS79aI^AH+wzXR(Cx0OTP^iJyR7f*lNt8Hj|G)~(!r%V z+>cG7#K|3#98BNS=B9C|qe2Mz$mOf}UQ;6gA?vy&uUll|~m|8n|>HjOT! zt8}*Bn$KNcua5Ymj!!;ij`<@cv5(<4Jo`yf<`ctHetwSVOeRvuk2H@d9k6ZdRjTAD zd`u`W+mpb78io{Mej+a0mmf}4pLTd$;s`fiiz2%2`@{(=68H1Oa(mxO!`U0CriTB3d zwycnh|7FHcQXpK${r3CX+sEYXfb7Q3l&l^_d_dp0Ajjf33j9UWOGgb-oBACQAb-*U zxVg#4}n;fJk101Bo54~Q!u;3YSdoLcXF zT^)rATet6hiT|Z!Hpp7oJSr9X&m!HBaoYEPkYt@rua(fU|<{PYvD zzZIuSgS<};BO<0Q-fVs-Ur%~*?Qg}q|DV5HUWwQ6kPL0-#f@)JoghO`*s4tf!#QF;ydMRUPVsV_lgf8?u3?G12(!#4YEn@0*`2E zO5X5sSvKg6Qe`RExm?51GVa0oRvZmf*Ssa+ti392T07BCV%fARgWhc}vxZE2ZFtA&wWKi~Sn&fG`Q3!Zl%gDuw>|KEdUX z@`>*Kn^|sY?~Kfl!O~yo$8xaP8=a>gM=b#;hrxT6PR1Y8!y@?);LajH?)%71wJ6Zq zN2k)oqv=XWv(uA`k%K1K2IS4m4uLNv1UL4o?_q8UEu^36>s zI;sZrYC^`e;#)?`kz9pRU{OKrmj+W6BMXPiycujx;K3^_=`(*vD=-UN5%+sMSn~U0 zubri&Cf}~5cl}!a7Sjh`R>VGy&wMFg#`K4W_vNYy*MbesuOe~&dE?!hZMl2zmyYE> ztmpM!ue+AJ>96za1w)e!$p9Ok0nH$`ev*d|WaD;}_j{BZ1d`lA|J^t`Eg!G_THsup zzX94{Xp~`;X+Cd2Ka|lPYA(|*pv7Ha2x`86Q#McR)aJ@CK#4G|wv1Tmo3Ni~*ZNMk z8C&6vt;%8W5&3&$p0t(mGFILGE`Z^ZevumYm?X^1;u1om`NW_~f9uutf+win@@3=n zp?9yy9Sp_?ESs3|s&=yhBNoD%!FG@c(ED!FMI z{v)o&a?utyZr--xn~=o^9lgTXg-GyeGH8KQ$=eCYLEKXk<8X_x&D2g1D<8IVyu1w6 zPkWNecz=hS8nS=J)+^T9zMAKGb?N`G_ntvfb?eqB8IdR`IVn*xG&C6$5hMx*az>J5 zu#qMQB?vS@B-lhz0f|k{Ip-XjoDpb{oZ+ta4QKCj&)MfxeYXmi@D268&&m)BToZZu*M`91Wov0N|CPLQnA@?{7O0 znEA&Jpqs6F#!c#c+)G;MHD%6GfiBCp?dm5%iSu$Uava+Cn6ckSh?^kBzqYL>#K*;( zj+jspMdGK)P$!oJ_c_&W^d_52-+vLFXMt-6`)az)e>Xq%bTfX`lTfz!)#skx9%p4y z%quS@d}xF2n}o>#&;2~N455(XMI&&a%Z#PC;?}&%=pdrsGKKJu<$`{~<3=6To_pmq z?-ytG$ldlJG@Rr1ar>KXPEQp{ub|-(nFb(Vo`;%@P)V zP5r7D6OIv3J}CdSg(XA`(qzHx56L?B^n15>h>+9?d6*k)y1(Wq9C%bW*ri=VcJ~gO;$p-p< 
zoOPZ+M4o}{uZ#~V#XiJSj-5=O9!}rUNhMN)Ph!k{0uzqXioR-iY7Vt@<#Hek#vki{ zY;gR!Ky<8Pawlm(g3H^&G@DRSF~b>HNYT|JGfS0z)R%;SIP zYHIdc7EdGv|0&7zM}_@Ki}!2z<`T5N;Jr3B7zWS>iPkLmsmO)x`@Kl7)`xNoXb{ol zNq8yKAk&GwNZ;jk9sbRGnCCd$1_?D;UL769aVWgG2F|0T%*I$>MYVX&y8<}__LC_L zDsKAA=yMR~1-gf--5r1Ag+VuM_m-^!>$9sA&7Pk*d1Ip5r_F;RF8&>=&1n(q--D)G z0)Ncdb|j|hS{w8|l&ncZIU6nV{bICRq-_gk^qn(UtTqQF`&hltVZq#<3=RGUJ8CtX z;@cnXnrOK+wYRjMG4q=7;>f>5W+VDRl(IO}Vj%M8AkWv*q$x~l^*@+5n8R9^Bz|;&h{m%6lq_P|&M2~YJG~+ssRd+5+mLt2Ew_(okL4jSR47Nw33kwd=?AIOlNJ4uo zu?p*)cTNQa^~jd|XXxBtBfJdfW9(<;JI)SmqT)0S@ebav@wM^(Vt@U8$u z7>fy&M>3t;9!3X5*knmC#ckH&hs5iSNS9LsFFbj;4cSh6%&(^MZx5#MgjM3|dR`~;HF=3bo*0_fyoua&$9yl7zXlt{?>(2i3ktK0wN0XT*()c|!o=UzkNsWzPby6|& zuRE!u`RC?~T&w?POvb@jKNkm{X%pVS)KYlNjwKhTy1N>6SZe;`w4}|h>$&$LBxWuq z)8B<*#7r_nzmWEX_PEfDV5s2kgMu@xkqIV8)7hObj|esppB~yu<&d8H!T4VH;%MP} z@Y)ud!+(<}qqL7F#I;x5Sz2nfwAG~+3kCllmj=Bp2m6UV4rj~^fRqf%Y42Jp|m_e1RN&uK6` zEq7go`FPAa>~#!3GP+kRvFai6B8R=^I# zpi|Ejeb=Y_G_~l_V&N})_J*)Q+ob!)C3ni^up34-cVli33=LY-+n07PV;>+g02*ax z^E*iVN&=mw z%TlbGFE=x7EW$_bHog~Jrm)C^`g}?;mTr{QBvKxIm_GItUvH{!GgycsVZR|7`425F z=5h4r;?4BQ4g7FvqXeu#)Yn~Ox5wV)O?pY$%8Nq7F3^&$cPzi4+r^Cz8YEAwlw?`u z>;djNbV5ANY0X{kHYUdgr*R~%0*79FE4GHcJ=W_zt5;-iTPu1?qEF|d!h7+`>-4AZ z!3cL6+2z`!-A`$!^s3Guawwz{S3D)0sH{Ey4pUgNE;bYcbOX4B+jK_-QTgL0{x2BeD8z#e$k3mg`rg z3mfO>cSA@m*`x&gJ3Zpm-=_m{4xcFmF!qn(-aZ>B!Wx@TXJ$OPhx}KfmY+u-X0R;Lsd@5XS;Zb2`NnN0ji7WCd z&Q<1DGG=10bxUc%Y2{>&@aN0E&}WL4Om3#=%~TRqbV+RAd!y=}sa{ub+=bm4|Uu^LWylIfddaAgg|Cumsn1iNJ|rh1oH5v4wt{ zN{{%&ZP#f%N}w6J=HQv%UN5{?sY&yjlBo_d#p-5Q2hIZKtttW`qxECS`23^x`HVsbh8 zeba{bKU`lt>xAu%)qly_=Ko@b4N&kY#1*m+%Yr`S&rIO2*)e!YZgdrNGLzuuMy6GM zZ#&*En>sr}@nIBwh%fq%3VK6*1XL-=< z#^ny;;NnTc*>YCylnIZ~z{DCRfIzV~mzyH8^QcD27kIzj(g0i3lsN6z+@3KUVAQ^YweR0Nc{iK5FFgEZvM^3#4SfXKnsPAF!oxRYY}FX+T<_rpExZj$G9`blGM2M*716oqTC{(WXqVD z24Pg9i=}9qpzi}f6YN3ldU6WhC#Y9TkRDC;xc+CVU>W?;Z-TnZxahcBWujA^<}Lbk z%P8A;r{KAkpuq6{Yv#5mG2x<}3P>OB!igu-Km$`Lfb)rNF`VZAh3*=t$E)OIn2+zr zTR;Dpv&Q26=DPKvkx`(}rwkqd-s#W{+Su2)M~2aB*Sm%5Snb$9Ob>)eV@h9tsAJix 
z^|a@{%sF1{KPGCBb`UrwA0MeM|4kz3IzNYN((!{gEFOUoFfpnBdKiIDp~o#`bh2CM zldxM?g)33H1Qw%IPE_ zyf{t4)@{&~FfbW=Ie>0Ti19kS2K`XK5IVL2xTOD%RWPcrMo)d02IM#d{o>aEO%7X| zL~`yx!#}VzO*qWD|2<1XGZT=gX)C;K+8QI^;r+6c;X%aZF6HVapM)^MccMHXMNxI= zKC?c^fApcBLwPdyr2^5krV0Vf6H)FrLi|k|LVuAVNXx=oj=RK=Uk|R8$Uoti^9Dm`JssCVXLeqaUHV)N7n#&uvy+KP&1bIq$1o(c*A_tNtB|1%~pqCnPyu$Kd5pT|O z`B$IQ)QhYZN?APMMe|_Cnl|NZA7~}=B5r+Exn~7B!qwdNTrcNzuqpTXNSHv=zhSPF zo9B(ab#g1pe9Yx1-75=-R-GJbw%BEgY5dcnveTL;`7&nvw5OVJ?;oUz54gB=$}Cy= z0Yz}@f6bKM{!m)?F4;hSOkXocKoc^GN6iFMQ4S{D;uY(ve!+9Cq> zy<7Za2U!ajCn?Ylkezd7CKU_wDT25;=Kkiyw5Gc)HS(u;*0{EB{r69K+y?9#;}&TK z<+^#Q)g1%s=OMFgoeraiwI%ICnuB)82#QJ+3JqAWhL!@^AAP&1;<5JbY0 zim+aw?L*rP299Q9x6B2jHIS-urYsF4l2AWtlE2EPX;dUZ@=HFa2Moe%o>=aHW<>8_ z2aDWW-0F8~455xmJl`h2FY1@4mvS%kr<4wm=jF zYiCXro%lZ*y#B%KrSg|>7MQQKgU8LvA^<@F@|GmBTHQ5>ofiaaN0hAtO`KKD~ks}o7{>=e2V?cr=cRGvRYT+;HZ5TG6)dtQATf(C(~5^ zz$SS2YmEq4=9}swVBgQ0ga_V$Jz|V@ zNFp#u5g+MBKu-#$hL@bBC?n4dnhQP)CP@(%P=s;2HR@cD(MN*1CBk^6C|ntcWrkr} zhBH>Sd$}!DK}_7yPd3=XDfK%(>a4R|vTA;#(3DG#ig9~3PD|GbDp90MQmIV77NKqz zsL%}PmLkgQ5q&g?AvoBhG`xBYzX;+GIMReKviH*kvv%rJ`1~k{pBuotzTlqA1yRc6 znFZ^&_g+LNOoi(j??;J_^g$o$G3CBwV5E+Z=i0t&l@S4o7Sy`fJ+(dZGSw-~Y+rKc zVLYgeGk!dyZC3I6DI=UJvX!^m1klgG+7s6dr&Ja5pSt^4SYJ4Qz+Wj#H7IY31J%4Uc1iTvnTdcP(4fVa0zbfpj(;x;m6wc%U({Xb)2(8 z3gIu^r%uwtdZg`(_l<`RE`MZopG7IIwwmvnFqrQ`#jkpTOO1_`-;Wy0OHl*tk&p11 zs{UbwT>ml)8K70qmB&&(SKp+2=?NIR-0~2B$T!xtB>K!&2xjh7jidcGBOqf6BMU5h zP8OHHupV2vtfg`{Z#-A-OQ2NiTGbs$7 zC@9G($2R07xhq=a{Emrn|Y zQj8Td7EIgdtjaXk7;B8*_iCyIb7jdrVb$_VqsPuN0`;uGIPP{Zl9x0(FP>E?+>Rio z%Y8Xrr#w+Pz!un;cbs?_4=ZzLcXK$$Ll>%uq1Hxhl%z;pK-4NuZ$*EvQ8#ilvbm|C z#E?{W7<>`g-$JSgY%-eP0H;l*nv)XEveFl2j+{4ey@2i_F!(Ht-s?*)+gc9bAr%|A zZhQ|$3ch68ESZET+bzs7TG4R>-^Q{wqZ{}~Md9WY?s6jVB~McPa2vLvD)1!jhlisc z!s~^_u}gb-=5FdTx(SEjs4hOQdtpNN}8&=IE`ww%+?`xJ_f&kdUrkz zS?OFW6Y{#fT-`Q*1t1H_RAQDF<^H-B`p-3`%Pxg?@`&fatVyLF{>p*+j3a zRJVl$t@n3^nd%Sj>M~epDQ#TO;9Q~HP>0Hc$(GyXYB45qzTDa&;JrNVDTh~fmwCM2 zzYogv#13)uAHw^pRau2xb;#7Y54)7|hO5S>FP`i@dB^0rV(2^O+|(Ar6Ev{K6!EP$ 
zOZwqdKxv|K^$9uK%7@7?aX?%esUis-ycemG`%&A7wF1l4(>>)xb5wId(PVC|%4hV$ z!t(u)((dj$$}kpBV2AU1ZI)c)!rv!A$uX8<6VVrRu(F;gC5u=Ewg)8%0$U}P9k_N? zs`?9Pr$E6>$3)b(^AMgtg``uzg`|)EevEg-2`eMPx&8Oie52rr7jdFnFfn?d)#YT~Vn^gb6e~eH^61FR98!7G`Rw=% zTA|3es4`&>VnNZ*y#jOuz;Dj)BR^dFuyJ~0?bsr&l{e0!0p+B|8iR>!l|yM}Lu$5a z*17jrp~yTpkyC55^#v(z^qG2iv4kMMH9eWwh4X>ets7%PeKuHc8E=Q-#0AyWyPt{} z?kGFq3a>q(XM{G~N9}wwza2IsURKPmUl}S(do-zM9ye~!6UY9ZYjn!RT(<7}`D9{zCaqu{!KVRwN>iZtm@9|gQ!rwT6~ zl2Q?1c6@Gxw=b@>%G4`|2u{jS4L@aM00xQ@AyR;d6OBCq)=JX!wa(1 zPlx4TMJXINU=C+Yl3VWb=j+byFXuxA%>N zfM0y4_wpHFZAlVs96dAuq zo9?t_eP|Wm7gb84nYobsT2IY!uu`cZx3@gHeefg+OSj=Z`8PL|sGV&e&UmqDd~XR7 z@`{LWjOAqcZHR=-0CB=%)?(1@0 zwV+S~X#VPYT|@Y0Lg90)J)HPDDgnoCoN=iCp@HUvrUgqsGBLuS8A-oOc2d=g@na?{ z93xajzbCt#4HgJIr(JkL;d{kqhx(xvcQBc^41zBe7d=u4@;#sKF(=# zbq??zXX%Tu29fMu*?m-~$Y)E_$oyy+m2+L0UO1Z1W(;^$cxCkM`5*EhN&P~Xc8Rnf zGT;fz@%seteL*R>DL8adxw%9Xt~Wv+{*;bwr+c!mQ{Rc3RzGd*7pK`#Y##q`P>pf& zJj_L+s>~d|DK$|xmi10jaq)HsHcWT3$%x9NcCde_bQp#rTf-jnXin%Ynd64|t58gDF8Z%Eb`oI(Kz7xoZ3=|4T;M1b(qREkoT|LA1&m#Z@tg+U~4 zF61QomJ-V{ww-v_o1vSiWm*@7nCIwK^LdD*MZAkS5NNgVkfh>ciIw^=Hy_es95jC> zzPG9u5XvN_AbN(|?257I_@w+uC{jeDa$g-@$5KG8-+l8}U|MCpUtRv7mFLM{)ccQA z9A8)qO%#3-8zOPi?X4+9Xry7EY4HG%ALY533;jNVLk(D9p@35N12uvJPa4nd^pj5= z#v(|*?yaK(CGvAETuhXYDeMmI`7x#Mp}6@jjR0d^H`-n>Z93HOnt@41t?Sd~H=yH{~b4(X5#|A#t?jm65qTjD>~wm1|wx z>Fj1U&n)$%CPc80=jkyd^Py=tW*E1jX9YGD`#M;rgJ7)vy)_j3CyA=DeGhNn{#T07 zO=HaxkI}Q;jSF$|cKk8nU_AiQzGM=jt5g`@2?3ea_W~luk_{|oZtgi`cOZwe0a};V z76Rjy;#6iJK6_cVQ-0*owg`NJ@a)oICvT7+;$(!=o;|&6_Gt*Z0HuZ%A2+pCb33cc z!Cb!0{OaePNKj7E?`4!cUq$sn?Zj)s z2edlRd_11S;_Ut}&iQ|vJ|vwtmb)KdZEhbQV<@#);r~y1h(Gg7|0hA?KLlz1&R+cw ze*6#n3QPU(SW-Vf;JSr>%|!gWB2F`0IVxmk7uE7=7K~@~ue+G9kf|w;7I&|j7W?~; z$K(6UNt0k#cUmL!BHL!`ov0y)M>egH#`-LGXa8Yz|^_G-A5L9on2xlK4(5+`Z@oF;_qyUe`rk8zr+Z*5I*PAj|SkTSghzykrxnH*eS%JY_Jvkcs()OJJ= zP*AwQC&|VoMx5_&O6Uo!Tdf*)m2JvAqR5o91of6Gy1Zb^G1LE8qugP~I#o|6L_(hl zXk=^xx_KHtNQv>FewnP+Pt*kC>TNt@Ds?AhE}-WCg;$*9;jfI*0~*XGuV`uSKfna? 
z#4I_nJhtEur${$sslU3S{)al(F=hJZz^twQPvcrD-?|n-f5?o<7(WtGo>JFOU;-aZ zk$HU?infQ8OuYnjja}|fbikWsm$%vtZ6mseY z4Wbc4v4)_)l=CWptZ|5=au;v&Mm5RhuraGE1(WE0*7a410Pu@o+Qn2+xq(6|S&s|UvQmMp!JRChGq zvzeR_Uo6SY*k`Y(y4m_=9bc&U$o9h~e+*-i{z467YQy;07FIkO=`ZRTVANuKP+bgvB9-)|oKaDAh<8FQ}U%`ZL zZ(GQ7^Y~uXNi7G^#8>kk*d-hLg6n;&8gUsR_@H%b2uBzb`uTqp_}8`Eciz>Rjs{Lm zJ`*}M(#4=gwnB+60Xn$3L6C127@>M>Q~QYA%Fh-wi?q`5>WhG9?`%X?^lhnYbcwxA zI(qY_4@Vm!czY3R17B!mxlG--posfnBVL*r{rg1mcg4LLO>27mD|A}eVX6?oQ^9|* z$zJgRgAPOYy*fB>Hfy_ApGi8?%!hY|$0T6Wz)vlg(+=xs=_DT8w*1x|y1RW5=qboI z1(6$C7S5&ORfu><}0JLF$`#=u>pEy8`4! zU>h%_TqFSCIVB!={e0H&#(jx=-c)I$2l$Y!kY0jqR~I>eO_o7|&BsKMs*bgR$))55 z8A2qtGo=}lg&Wx#L2tb{b+5s<#%%>g>upLNHXEpz9AWPoV9=&u`+11u_ppc>NV=gl zmG(q#AZ21#UxI}0sVV*MA2(_rv%FokUlscX#{gEYW;A|ME4$Ft5hbXY1>;g#KWp-a zv%^+V;-DKb{2?Hb5*LNbI6qkiXAFF}^^{Wf==4)S4u40wuM`(h^xI$A?P{ljC{r;< zb&|&Fs4P=3Aq<#-K3Y;7+}G6J=q1ntWPvap{}hz%p3NNO_q^Wj7Sl^Qy4V1YWH)%; zRAhge{DQ4fHo}cxPHMXi@s7UPY%Rk{+J{-V)OOxI)#AsocVNu!yoSK({_@+cFr~&? zDKU5qe#WsM?ac9pV(M(44Mvb_OHw+E)b91L0vqi=8dnG@HZSOXF=aDihqt1esupKA z-{K7Ti{!W!q;xMPcC-l`#9zHw1UY+bl_zTPIE>;xpfumL(L{#(5%4`UR16T5wUy=| zl?HZH%rVW3vO4Igf~;cNR_<$UI?$Z7NmAPc*+A1bGqfe`boMa@=pnuerFIi%w9p6=?3o5mnyl(+$m-> z53?{<(qLe2e9sMU7#ffWi8t1dG$hFPTcr=JzDFwd&A#>pWsL=)C+eLTKDx$NA1-LL za^7h+(YB?3s82g-GcP?+u_=8kkde3&7UegQ!vBSF+S^0v#j7*DsTH9w{N6tkgT-ip zWk%=L4zTysk*)nZ1;;V2NYvW zt#=NGW5FIgyx8(ZD*4VZ8ET5PAv~ew3+`|iBK6PnLZsTCo5#FtG$9~JM!~Q#BmO&Ew>Xk&3w^hIRC+QLUc&}z9spWp0mPvG#Dw1ql3~q4e z@iE%^&`POxozX8v3!Esr%G0BynyTy6Ui`G})si*Xe5a*C5^`jXt(++OG&^1*+`mq& zUFwb87I*j_G5QSKqrp;PSx*onBPiw$v3$fEKq^o;XmzRD(1|p!F~Jy`Obo6IwqG*t zL!=G8&9M!q>K_FpkFY-^k0!6Ab5nXC(>ERmJ&&I$PmGdPT%7H|A{i4djz%hk@3H9+ zl$Y9%-ipM?{rN!?o@irbuCc)(*cQg$f`|r6mw7GKh}po>-EmEyxxrOXEvxF!P3W6b z9-t05OeAl#>v$OvwJ&nC=Oe3mYLSbx>^`Qp|DvDDE_xQ&I# zlWgeAz^j;f4{ze2w03QC-jFM!7DBPd(=(*X3*c6#*TMd!%amP%RBL%SR2yFzLJ%c(2agiNv!}N2zS05Q zU$`7hH*qA=kW>~ge{&P@R@MGgOBmhQscI7)i}}|*4h-FLjMQ_ob44DtKZF0F$6v=5 zM_JLy@)UpDoAWQ5XYkD{?)@@VCk6_goICF$!X4Yj*hTapEvJRumX^pw_sqM_ufjN% 
zz{=C4zejr|M z`B@n81}(WtLb(h!mv0|gol<`KKK``4IRT7`_xiLHN85{Cz~XZZ>u_fgQcCe>YYP6A zY{h^A{>|#aA5~itY-PV3+iFex%c^R~E9?U-y$Qxq+}`8j?;idsI{eZw;&91MB>+kp zjUJoBlJvn@4wu@ph63}N%)xaawMDN|;mW!VlbTO}@Yo@MG|&|Mb34mI!{Jph4P4FM zIYErC1|J?`R(sr@R?IT2xqA<%HhiKoQ?_kRu@&fX70%cT0SS-q(fnqGD$iIQ0%P2Z z)Z-7cYN5LP!lm!i07CPFvvfuj51SHhn-vxWb?cCtF1Cz1j36I}RmAQ)9Z7!R=leHOMRzZ7r=o^P`(0g!%RyX+*VA7ISH?;!hC8=VD4p8nxT~=^8AVz43Slz=9S5EvR+%_v=0h&)36o8u z@xaG%8lm>1^yG)LOrp?i*Xw8L2OkC13WxTKU0t!vAX1*Gx4N;FydBCeu|%^+bV;QP zPWt&jZE-qYa(~b|PJ!VZijyDXj&TR;DcaMu@qx}oHfMgY^pWP;!UW;jWy4NO#u#12 zGH@QCVFC*qvLv5AWGn<0VtR&MftK^ghHClqv#zCIZHtM1V!VYZ;(TObwr(7NN>-#$ zoo`y_B8l^jFm>2ef88$%8B$rUFTF29E&|o$^L9lGfG2wPLGv`|1yn4ZD#~xN5PWm7{ z11rNPZeE6v-=}!}l5r=dg$=;S=H80l4r{YMe={3A59&BIcUdqOa{Mp>+~X8#VNAd! z5GTn|;dF;W=aUjAgFEW8xBjmK@sJsFa_xtVPRdgr+47H0oIx7-{?OE9vUWgcL}R@+ z8UDevuu{Y=&3+gol%)~fMZqzmAE`TA@jjE9SkboRFGE#1e-;x#Lmmjae==rix-~16?M2G_;)!tZ$a%J@{r9@MQ;_=T+R-)&A2ys~3>=Lt7++{FH=5eu zuN@oP%{F84Lb4c(8{3(;>7}o60`kC>6eFWKnYn}7$BD0r?3YV- z7cY5Tz}{KAC~8FC>{Ngp{>$2zW6Vvn)#GJD^&b3JUZGRiwq=dEY9*SEd0wd$4clMx zyhhF_tl9u(raT83j*LY#^1~rhS0PJx%M%fYl&{VwumP%r<`rt4BNFx()4^O!lk0e7 zY8S`PLMh?bQqAgl^}`7fci}p9|DSmsVY)>z6p})bL~CCCE2*x`gPG|JU4A*M%262 z+5T%be{0{`Sz=hT2})sL_}(n~pgB*^ouB!uEl-?!M{egGf4%vT2~p9Yjvt!Y&`D^n z4!J@>0aqn<7D9HmskSDJQ8nQHBfwY;~z1)|^fL;OAaf&N%*jk53Cx)#GI zT>9js@WQfavhkPk+IH2JvMt$&Zp-g>+~qWOL;5vw?t%T^k`T)$9d-&CG%Rto+qz?K zm4*+k^(3 zac93nBh6eSibSYFZ)m{4U`9*zn*#9|EB$;zL_e zZVrEr=MPTggATp0_ElDM9m<9AHI;9B+q6qQ!xZiyx(RgOnqBONQR- z(%JSbrL~t&eoq(nHIspL2Gtm?Kf#Sni1L^DIT0&`Sd)nEF69X#tR=3cfjA%#FS5>z zTg8cuZh)ummEBQUtLb6|;48aVa|p9gmx4MOh){BrY=rDD>>=EZ@PNPwjfmD;mY1ow zpG(k&_CAI)XT)LLRS&&KIX?EbUlfC4_1LXLMjcHrd?nyJ5w6?*EGZM4OGPzqTl9#( zhWN&+N3`KN@lILREeDKe3M()+JQH$dTYY=hHWM=iH#oefG&VY|y%+*u{*qk36jNIh zK1WB##MvLV8r5$>?F%BRzt&Wyu>l+Em}IfOW4aI>=0JhL8uE-{t&I;(V?SOHi7c=)(Ez%aGji;rm#I3s7%cIUVb_c}-6 zZ+!!hwM%+Hns0N++3!6q?or;iqnE{TiIBP3te*=?&d@wH^c=iW4_&En1&;v_LX zp#5&$_9>!O<_O4iSUvIJVf--MV&f0#bq>{;4jl#(y%bDAXAHgiI~)M$_+44zxM_@- 
z7GH1z=es&xBY9MJ{PYwlb*5lhYbAxju^gfzl7JdIszaI;xat)OZ~pHQ1NsIw^nGin zmS&FUf9M@QvNH7J{5x({%ZT5ZtM~OeKjLMt81Va#76_1VAKCMNTUseD#mVKI?y0O# z`&V>cDVHnFIcW)eLF82oYh}?y2OA0N)mi)xm5crU`hvsHwo|G>yK`E}s|`}rOxurg z6}67WnKmWv*dl+ZY)wAbt?$%5%4|TVmG-|l0s>iOQ|mAD%s|VAAtGH}cw1O@d6#W2 zwe4pNVF^}RZb}Ry<`(V7T>I;%yijksIbB`bIr?c&_|Zrdz+#e^MJ#)D1?wLTXGwbU zY;R;OxDFpPT1=TJdCOOHw@6*X5N5BufU&4J#nH{pNpanAAJFGGW(fQb!v95 zb}-a^g}S|OX{aTEQdHIH@^U|Z?r`;Br{DORvP%IgW;4Nf`S(!CF|DdeuJT0(Zc|jz zaT)oI1tSdRP?*dG!}JO5XG5(gPPUd{bF()F&rjKua`IbyQsSsGH?ZHn4hGj> zoTh%Gzf^3y>(~aQdgbQVnr#&<-V__!v99?HlRyj&&vnr-fqhhrQWJZHk9!Ae%PpQp z)FvZNgnR7U3OAa0&uu0i&;K+>SO7_x{c5wGZ0c4*)aIlrKyIC;lkPAe{FjeWH@G{= zHV(+-)Yff3YNx3`NjHj|`_Zh!*(@d&?=rKh2jqE83p5_KuNBm<^_^|*_`V_)Q>nK2 zGQa4q@=L#KfRbj7dF`q5M9D%PNiMiS*nv;ha7rSEm0$T}(S!I}m^9;(7&$p#OiX>) zl^P~7sy&g2-?!vgbY{_?d-5u=jq1eV09$0X=XBi#EDtPWWOk07lZE4Jv z-5omX*j&`N+l-MC`mMb@{eC+}?TF66NMGs6!t?UH!LniVl8sk(td*>d@5Pk^ zP@lui>86Dn4@N7WB=Q6KL@S(P!j&DFFfEfMWrP&c{nI>6O$$l)bJ^`-u}(ncer2V1z`viDZeuSk@8O|uvpx3En**(n2Fp6zgWqol%_QJ2H5;xYRSgJpD~nU|s={H?t#wsA zb1HFjULTUvWw%U=>&q@<$`?y|a}N(o6)$wg-iF@V?0%FvrwJuySV8US=#HkguS(I^ zvpLF*Ti%NonOU5hpPeere9=0o-5 ztmOqv3J;i1AlfwMHIS&3qQO5drYNUuy!q-A+0tl?PM$;$%b5)`GL6LIL_RU48}0Zi zS95A4Sh{Y$lSC8Bx9Q}+sX&>yQ3P~aP^C9-Yt_}cwXrMhxx6EdU+HdEKK8$D++PN_ zKhRy-w0?eaOfKnjx<8jpTn{2`3$7k0r!n2UTlkV{ee&a@E!o-bikd=4{_W)54!CoD z@3O%-V_iM6YtL+9@?D`NNO7?h^0nJOH)&)}!HO{uSga|}&yPoDs|EMPYmeLIAMkQfGQZvvrNA`j`=-8#c^*xt@hihU5dTEn@v-C)ded^2SBE80|hl zKG9E9%tN7b^_`Y*sFDQ422y=_`aX+KFSg_aZdBzm0GE&AN zlkZcrE2By$Ek)hR&`-r@!zFvp(nFCoQV2`k$zNA96Q0E5hm5mRG+~iFKpNbdx6Wk$ z2*{X57S?f}K}`lEI(4VN7h?VRh9;yadii!&sj9R>j{0+? 
zznCITo5woNc#Qv3^{?G|`>nQ_GpYgIVq1IjFKHrKJHv#gIx^}bWV~K5>OfQ*AmMC_ z&9DN`>(s_|bmeFQnmDB8&q?dOXcd9y>(0n`xm)R^MR7;ljs~4@3fuGo zMbww-qv-D9U>K?SCG2s}OJm~~ioH5E2cm5C2+}BRjxqVZeF(B5f3sm$ze~_|3OV(# z1Gw%*7(ofuZ4I{X$(oGJ0~G}=4mXBWu3FT=JY=+MkM*S7x<#4x$h(#WL(mcM_n@o< zd`e=v8(%5=zD={Y9h5rYS3aA~!TztqPN$b-eb!c$`zDE$0>VMw&lHz=Xu>*mh58PZ z>hD?oryx`MxsL1o)2Y-xSsUh57;wvvRj1f62RghRk&wi;q?&rb?GpEww*qK@#jqC0 zIV>;p*nrUge*f`*_z{?(-#7C#b|3G|$<}+HQ&JdMKlrDdE5+4FCRrg%bk0>k#ceXD z8)kU_bx-V%^p5|fTIGKkzXkC8XQx-EALu;JpWqV8%sPE^Ooe6#PWYzH5J?F6*46ve z)m^J3fSvIvR$=(@K8x9alheZ02LRtXM|!yC}W2jRo?J@;jK-rHIxPWu|S9sg&+uJ;us?_Vc&Jt%1` z4lpnd1az(~1o`9rrnZ^@RGP z+p}PoXw26%>%p!ijg<~8YvX40&GWzj%lpgAy_&r}J-e}*h8!xLot&3jv*U$IS_Ac> z_W2q`?9*%uU()>`5SuK`d)|Sq!x$@?h2J(NhN)>(p`AF#PFHxG=`Z06!*RB0z|$5Z z%@~dFe%z~#@m!RPsV>%ju~bmcoMIdh(ojZF>ldi(&KR~UBo}c=o$z`-P@#0@-zueP z2|j`EkWPI^JgwTz7&>||fzqbcm2{n}>KoPUceI1Mt{+At9IqX6A;WKhfKyCByO^gl zRFb)LZ+!3_gVouTYeAc+eia+Zdm!cX33EEpT=-CDmQgZm+A?h=)+KQ+Z7Kc1ppb|~ z3N^O36Uixyx!0#kV_q>xz$S~GS5f1YmoKPs(AhG(8w(dpRilrclTXzLbx3GEnkV3Y z0}i2lL0{5F5CZq_aN1zKYX!RUyVj5+iw~x90w0Jz@AtvYtHQl=`YYaFt(HHv0#6TI z+|O88*S*Ry{aIuqHu;I_H#K?+opRcsv7RZ%W@LSPJfB8;u#}PfY3=0&#z_$jtH>=+ zOoG~6*D34|g8QKc;!suR44m(G0 zW|^&3GQ2m@a!sJNvRci8MAQZ_(>exX;}jj?@ZD19<2zDP2M_d!7XI7;q$-(}!S& z3y4n{@Yj zmy&zsS;YoJ_qv)$6etXwYlkaYQ}qCq`gRv+3#ZgX_#+Koo_7ZZMD-Dk%g)72d#@(c~$iKH9VR* zA1f+b`*(LkHUX_9tNXeA>b0_hUC#1dE%)=Q#$Of;kEx|c-ryXV`nSAPIDFgJReh3G3)=t z01&pwwRqVC`@{?sK|Wiv*zZoh_9XD|f$ZT>U4X@w&!>t8v-esTUZxE|bgQ{JDl}wn zhX^=)cC=0MlfwAnqge*R$3+h?{f5{etQ0kdNmnpwQ|cgh6*OdpUdivvm25_su8w?4tU}4FHD)bhn!k-cd;rpmc;R7;$)KU#sT4QK7#HwDaUuQfP7JO2WpMd@{B>f)nL1f-HUb{A=*Z3jEj!-q+QFzYV+a;s(#(dyRY z;H2^DhyttDo(=1>bMb@KSEzCfC9?6ylulogVMwD*|I%loJafBwxfD!sJ3yq69%*3) zY58NxT*nXbc@nsnD*mV%Ax39cAZ~}aZ#EhAF^eh@krxxZt2zg`l7SjD&plPFz^Ld7 zwFn>0YMyA<~V_vT8>J-QFz#_FG zH;|xt&-zl0ugg}6C%F(yFvx*$!_oeyEAQG+&jr`Q1wlR%T0=wvcer4~i6}pWt2?>J zX5>M9JW)9Qtow(T(j%wlwiQa>sKbY+7cchaUodbm|Be|YHb^MwI7CZzTfSk^-F{V< 
zMSc0@*Z)J^dxk~1WbMBa1Q8Sg1(7IGi2~XL2_gz2K~W?J$wGG%1<42~L6U$Zn;@c~ z7l zx`tZ5M0q+383-kUeh$xpRtof%Vx({I$eq$|kem8xU{DaSPY6vGvZCW1T!SYB#iYWP zonTY1tMzUPjNXn5g)wF|^xef~olNK4>tCl!m|%m6P~Q0$*s{^v!41=-n7iX_$l}fN zttO<{bFm|}8ffez0p9g$aU^ZA!(3^p7^|Ceb^ zfczk~yJXAGZ7%yi5biU;t6HHWu#>adH5}b>u2VUGFx%gETkR4g23a6Gh>r}n^f2@alJ1;E})vG=3 z3;u+()|avB1IklCVOv)-Y!pY(2Td!GOwH7;XS7r(6wKXpTJWz&zIFuH^I8(=_&i6c zr*SJ%kLQ3=2;(8MV!;EoGlvzdo0^<=3UxbEf@hglfXn4<$PLL46@GcqDdgpM)}9rZ zeine^&hX?+S$6U=)Y2`PY{wD{oe>;?A>h_U&c*to8U=QSnJZ&o4Wgeg=9=>{h>Fgw zgYwZqvRHOT**E^O7fp}t=(mc{a(yfz(>I%7=!HN>1lc!q+-r>?vT!^#U2UeJ&stJJ zduG^Oh<)iDV4a#{`E~dGfY3sQSPwxq(-1^)4iRW?WhXxWN)Paf=YNrI+J8TMU(EZt zT}%(g^+@=t_7@t>2V%{)aI>b=dk^yzxj#PEV0bEa8`5!=ZqZHv?m8YU+#)1q=QF>! zzQ{k!2T-lXyK|Mup~4G6hU_;U(BsCu1)5x9PDMhnw?@7|tBHdaaG2|+lx{C#PKV$0 z^dqMLxAWUcGaIiRsF-VD(cHvaC-CK9afm#)kVm$8C<$nV-ugLW(F2IibBE>k; z?YKS_CW77c9>a^-7k!3oB*qIbBo%IN#|4TJrb9%EtvzIn`8;oafQW1>Yh=4Hdl`Gs zCeg+>YoHI)9hXg9k>6%#mG|86?LN8D0n}_YU#jlET{koAh*bL4ef;wAc7~l3?C@N6 zxJ_XDkM#^t(O;oYpso?o*Az7%exn|Piaft;+;hK_ zXWuLRS!kJ93gEzR;(7UtLIxTBEPAg*=O?`t2bRq{O(>t%uL1$D>Q9Bf%d<6o_Z3JC zQg2NClsCR69b=Df9Ip!zxt_Yy92LzJ($}{O>O7H0!WT^x#$8~V(lHkr@ATqXPSXmL zN9{nctc8ek8eJLhF`GG*PLH2kh zESvNDTOonCkvx+W{es>*!9=^)Xc#c?Z9Gh1kW7HMedpNrH0k|`mxT4Dmh!0}#Y33m zWbKv}Kiq=FcJxs-5Sg69p@pTQ&uZ}Y78{&`hUS=RPN9kE>{P1#^tIgh7Rri71u7%@ zt+|K_huvb?LIc{&1+3}r==;^z@e6FB*+XAj#RuAAehfr{+c!-ew(Cn(FuG4IN$&PC zm_%Z~Z5v7z8T*9}&}kC?ZF!ie(-ifJvtsFm34Ga9$gM9&XIuI+9bG6(g0fd<0Jt}T z!3m*13hQA(TCVhp1sYQ?CY)HnK%aBJGe5D)v7Q3Dw{W8zG0aGvM4>J~too*+dUS+3wqtZ_@eRYSu- z12!$63dsCH1?y&Q_3LXTZ^TH|i9X?PQ&z%NK;HbQVXj*Gg zK$CPSv>J?;@eIf{<%(JO%)_oh7Wau{!zd|0_;H%i=Y{t_-ZG5QK-r=!Jgs=Aip~6Q zHaXUB5-)=*~*CPMwMf4Y2B4AWR#-#m+U*RWUx%hV{#eev- zoBO3HusOn39qsNaU1p^DaWL6=yt9~7>*aN{x3=9meDt+ttQ=nrlp1bTY6V_}G}552 zfHd;>crhu-%iQrJ9+uoBvW9`I1Ln!vvHQ_hQ4A=l;c(%Nlb#3{<|S;GbQN1?r|tj+ zxD1sHn*56HTn>0gW{ z@qU#h?Hp_nzqJ^?Y5cF@@$8JHxy2?vyqY~fxy+6Uv1<KB?UK3@HrsDZ63bu+lkSi{}V{|BfQ~|S`Vpg&G_)? 
z!yAi1uuoPQ^J~f}Y!lzxj1$$B#&4qe@;=Ch_q%ggLI7y}>31*%BIGT(>uuQ~oxfi~ zG@`eq(eQY0nrzPUq_X=s7=5}1%e5z!xLaR;l$*flz&;{zwn*^SW zw;Rkiuti={K%Ses;I>-wA!nb&MQbT9j+^~29-;W>XoJajPmv24aT~t3>%`NDULA11 zoI@V;(b2j&%i`eDRaJO^vS z(&Z5%-0%&o>O!(df_E8gs`;PauLz>69yf4Mqq#V7x6o<+@_8n0Mo=PT$T{e)(MsC~ z@&;~tb-t#E2SFyYcEG>_=k~q%MN1t@IQAoOUK3jX1atpnHT}ng;z3rr+y1bet6?qw zdQDxU#e?!oGJ|y+Y-gIrE|Ur9vgC-~zay(eazk&N*FH-kvhD7ctx7yu*ZE^*wJ_+^ zjXPrk8SetkpA};lNF$!q_o{WROO@KGX+p_bP@RHnFNip zLGCpGOg8_!dlnlq3RaNOQJ}H9dhB!41Es(@27NW5?@QZWB&VUA-%IbhXi?bcscqzG zwaRD%dL%l0K>5LS`UWZJABJonayQGBcVDaCD{^n($Q*E-=y#z*cxPt1Lj@5xUG&Zm zw9zgy) z{>A;kwz=r@8|?l&2E~XNc3}R^pvkuQ&aNoow~L05aksP=F7k8gc8V zd6V`}^Sv*j`2YOn&%HOHe6b^JifcTkkGcMho0}qHmPR7)5BQX9+&`GW{uNl2`UtEH z60+>c99E#6xPFY4k@4CckdZm=F=M}16(b{kJlYF<<1W+6?&ioC_t*MSiVne$w1~p9Kh;86c0}kurZ{lqpn(DGxQ;0UyfV2EW7T8+nc}-I{}+c1!)>08 zG!@2oCG8aT-kYQ4TYrpdvLKX=d&$CrFZ3mLytvezb_x>Nx0z7gM(d8=q_8*jK!72o zjRv=?C)G)IpaU*D6c#GHW62KXEAzy>Lw*x+)jqV7&MrN2emo@2lhxjnWcf_H+Y_A> zBr+gj%ZEPH8JtSG#`BfW%r_PU zq1?;Nblv8cTIreKj1?>ePd$bBs!+_0y_5Mr&|`MPv-J-$&8Rx@TGh1C;`ZR2;rqt* z-edmz!6{-B7Fm}WJes7j0omN+>C<5b2;Z44wK#yllB-)4)Bg_b$9S6vfulX;*thlC z9dXu++}hQWG{Ms5k#a2K%B9CRl4+!V-F3x?avoiV-5DE?Jg4i{2ed@aXUBR!rHeZ1 z3LMg5jO%9@n_H0u^!>NnJU?x{|Eh0Rfq+DA2hL}0hrJYVqgWh`<%7%T8_}Ui%cJ3o zS-P!Dbwd(Yijyi0DQ0T28@(NO`7&|A`n(O5{OV&a8LIt+CaXwsYmt>{J1=kjaG6Vt18lbiz2pw4m zKI94YhVXrq4pYwy0%9B+#hrZ3%G^ril=|Yk*pX%};}+%j_p1dU%S~Xh#Tc85^t&Sb zGq38NfuVxj-pu*>SG6QwrF0vRV}Xxf+Zb2oJ`Xs?k}MlA^G7Fx!Kf{JzN6!64|P{5 zV(a?NGrgn0_TI)BlkVA38GsJWEDaqad~ zz7Jhmr*Z^%iw>jgb%Ov=syFVmng0;!$NDV&m1T{`q0%y3PxEhu;YXBQNDup1TsR1M za%nhYwbDqWuBv?yjawkt@ArV%3v%Gc4bqs^4^mj##pD9F3B{b}XyZv;% z(%oMyBOv+CnkDMTp%W9vcA^AjDZ@lBSBUEWNo9=xit325V>hdgjufUi9=EwBRM^I5 zBKe}V3#^W(@GB2)F6(kL?uTUz{L9K}+b}7l|E}l5@-oLL@d!3>>AvG?;t;@C+ZefOE%3k(Vu=L!PzQJdR)CwD) zThrDGX~d?NUw!Xd-Nf(=cfML3*=>qykbiqs9Jo5%e8yL_3Vy8`R5KVHK6eW z(Z_va6z#4}A_~dT7`VT8l(^l0*>~g3!B*b{iVPe*4RRV^@=jOO&>^ex6kZE`=XC6F zmD5 
zeoj2KLz_RKF8|OjY5!LT`p%r00OYQCv(_L8foe|RkfY9zk2u`dZn>It#CEdPQx5pf+1wA-90SCP>lQ>zBRyIz%+5#i76C4rV zOO;kHUF8FZ3-*=3K{{K)9-&x$#c$2kn6oJE1^K*VPDfuPO=IYZZ4zKkRZK+bVR3zX z3MNaMl<4$yYlz`{@gL#l6M3IpScQN?Rb6i+%Jg}OgkU_ zh(qr7nY*a28iocZiu~|rCMY{-LGfE+fjai!LKB>%_1&w;TtAjeZUj9za?JL3jiCpO zw^{M5;Cl$QjR;M4_{+nTE;RGVmt)cxJDp1$7Y-(hWQ67`xaij~(Qz9G{KQQtdn!smI2-C>MDO>}A%N?pO z)aBCr;#48x7T7HVr58+8^;zMk!Ysc{3uzIvYzyX+uyZtJDo-Zq7ij1KvM z-*D|EOA%_pqGH8;i4zC3tKC$mq#CA4%5RKOjyoUybou`Ahe)E0o_M_x@;2?fy(Gi@ zv^MBqmZ!2H?h?%sRBOVOe3ZU0X1+p&~}b4 zN-rEdc}m_T3lZJ2MS{o2Yj4DO@Z9nmx1E99S&B}I7f@|!g*Vdf-(6ok9}=b5d$z8t z<14HVx~v_p0>>-^#?Z~E%U$WM&ll!DeVe;xTt3~Xa`3HG`e3Wp;l*oN2uo*BS3I zG6QyrOci17+fDjtdc#P06y(NXX$|K25`u>Zf?SpNM)TwodL+m9jYTAmd4%60-RvTEjLp-)Ztx8J?x zbAn#)6QLfLD9i3%7#P0YX6$HwGqO!MGZsp~kt?u8KbG@p$IU|>v<|T_wCf~Lk3LQr> zwZbdp-_Vvo`+|&f4Q?0unVk!)Q@VlP&o2pe#jT*>o750Jh}9(%EN^aNG5uP4@I^Bk z=v_TxjQJc?tQ11DXyu8(G4a!acBKwLx&Huas@H?%vPtd_pR!R!CN$l2I4j{sxz0EDx;Iw#HeFl3h>XvT*Zj{$`}6%X z_3O?joSL^Gl+bv21B}?%7S4sv=Ins~C7hT3j1T9hs$df|g>r)waqTb1PIZj`I__Fq z{5|SS!*jDWo0OeL(8)U)0#`##4iS}3Ho#^Exz9s_b1AIWa=)_hPUH^1MyXI!{Iqi9@0|uKekvB7=%wBaI-9a**zE*NtXR7bxg%|+u z%DTmyoj}4F<^e7=N2No(YJQtgMfFJ zI5nL=Dvp0C*1u3w5X<$X{bQaw+kI8X>0XU4vz4mxT-oo85P?{teS0uhw1oJtbXF`9_W3XqmF2)} zc-k}GvcFqOtvCF>ip7;@htFQp4p7$jE*W5SX~rU&V5wz(AZzra16h-T9#d8zOH;AE z-=J`MzTp{pp>S{3;<*h%+Mw^Ix5+DNNr<|g&K0DXxU8Ua+AMEXH?=q~p}OK~3$-uQ zUw*FSSgpyhRe2ZX8)7cK0FnjO!CAG22mVQ5yfHv?+V9FPU1;!`2ij{zXOcMQE4Ebs z^v>8YEvY4gf;WqWgdJ;kMHNOL(f{<7g}RBI66Kvt!$fj}VVM?-4`cTqBtIiH-4i}q z{KT9V2nC<4{9vCbswIn09-f?MKQJueC|7u%2wZ@>DyParnL$cX zfCw%UAii4DX*=*hVNAQu{GFJ^f@K2)V@`QJl@1=LO9Tlw;y7QhC4>4!r9A*LrT3HB zYkC;l4^LOKSaqHTYR<)B&dWhIZQ z8N|nh@D~wJ3B74PH0d2)acC!$$+q^SO`aCM4uyr+qz@)rkJ9FfE+C8*^^gUFosA4U|?69NB(*^44RoN~3q^Szho44Hwd*Dz&^A zOo&)@!}8<1A)!8P=JMlrV4B&Rzdu{SbkE)%IqUJs!x2xY^pEuQ+!^aj!=T+#<%Wq! 
z?`7x>Oh=zj?Qhzq8Zb#q>C{0c@d$3sgo`~bN^|>k6l^lG)Bi5|NJoMpG$W{Eup+aJ ztNv?vzcTck0@ge|p{h^;4Aw9~qUIUj6XJ_}^!wqZ4dF!%z3`q|2S*)A07lQ`2wG(<5Szx&9sqf+bR`aMHGH#B3r;t>ul&m*#bTd- zcM-4FbKM@5`og38gTa%Ej=>IMz3^l@Qxp)rBb40w2Lpj-@9GOQ!8s^MLo{Pp`gC8( zJeL-5b^UQP`U}c0uQ7L4iu~cRIbWrqADruMzej7L)uOO7Quqq9*-YZ-59xu_p|=4u5a z`8yIuB&gq=_F}A#+c_oM1~3)3+{?vfKR8RY^-;*4iph&p2$lVdi|pzpd_sX6STL`= ztS82vUVR_*~?s{Og)^;Ue($#C&-{97O&b}2Sg*gtqDDPsc{0riaOzm}ark?MG}v(wN3)UXva zJpWQ`=BwERt%8&^O>}HLDc>hTKuLX~B;a+v{)t|9Rwl zI2j{zod02*PN*QG+ZOLue9{$jk{()$1l=xX)*^xPbbc=lCaInBo}v>>#9kWNr4Z8z zdwLxoEE*Q-U2ipKYj*^_rw#UFUl``lEfYV)%yQs39zw^k znRnIX(b^Wz+BeWs*?Avd$i`QZLG}q>+>IM(i=cFoRh<+p!^Gr9CI~Mv)+MtC0Rsx# z@d_I&Z8OAt@h7&r?~yR0L^y}ynn9b7usbA(;LHcrkrOAc#NTyew=>&e>vni!p?@BY zxyVN?H-zYJdcs0f^4t>D$v@9o>?%q&`I1@>NJ<7!`W$+^;=nuNt@dMqh{32496mOb zBp0#o>&NB`I`ziTIpEi}Tixo1E4QJz%`YZC%$cIV$cl_@2GwO@n_9bKy6I5rs+FCy(Sq%&Ny!@(P6e@Xp`R}GdVXb)6 zp!0asAm#EC)1Z&|aFY>l8Wg96Q#LkYANkWX2y|YCbL`%{SH0Szm?w{;0QFhrq#mk& zEYsp^v!_A3qN2fDik5%Z(gm8l{%gTXx%=_a;rc+4jZn?MWd=t~JPUX|9u;XC@<03d z{MWr^fT@We0m%6NV;%T!CE;50TFSf{kdt8GDn2^gg?_BB1%=zIj>mh`(}LPVwT8z} z4!^c9AOC70JYKzxnT{2G z`{!X~8ppvk!KXr>EDisrq{**;tb3GHENHq<2}Bh|annZMQ|fnMYA$}Zb1Q=vH%s%@ z4@R3~rsMAI^$P`}np;T@d31;2i%J99*XBEcg3uXjz`fQx^T#C?Ev8(sk6ULKy)DTi;`X9X6z130~>+xqodu{W!H6b z_z`UjY`6++pqtw=(q6mkGP-eGrv2w|33ak!4$E|Gn6z}62u4=I4yOc}Z}P{YWCNb~ z&s5t5nA`22GRVd_(``}WdR>Y^1}3N^l-g)IdC)^By{G2Uy(0Zm;T63!&)wKWxK33t z$8w#AXOv7)qZ7*X@`79y5fJqycH5y-3WHLm5=5ev=f6Dxv_i>x+*=mjzb9iswh&IO z*JYPE&NFR4|Fq=SDLI6pS0|G&S6*IaUqBaElOd6js^|^%>EdewB%_2aYE@cxIKjE<1|kWANC}~%7Kx_D?^>WS(&TGQ`4!8Qd6tmXqjT09=LEz|V()N{s(YPzR-=p~ zPP<477Lm-uv}XieuQZxoqV1<%AFUz2EQo3#vv&ErU2RGZG7UdP$89PJ4~+;rFIU27 z&V+al?kjCpQHCqAaKI3FM(q){*ZO;1Qhfo+ zh}xxfStU;n?DV(6Nlo9ycVZcPfW{M3HWqB$DiEEgQ7)#y*Z(!;^ZD%-tpmRLJ>cyG z(qZ3@zq8TkCGSAUvhQ)||GZ}`ug&;<&Y4G?9lkL_n3@QHutTm{w)ngE-2N-V7E;#5?n_N+9RH?{tPs;d7( z0t7|Ww5kt=88h@;B^d+k;e*@Kulm~}g}TG1nj93IvK%cvU8aznGFxe3Oi}v>R)Glv zg2uhb5Zjt{m>J?OaG>3%lxL+0v@&kvwWk-~=CxQ$iZ>?ie}>)DPW 
zZp+P~+nFso+cBmEWkvMqLi}ywQL7s+xW0^ksjIT1S|74*+V=7b znTKQ2?fBCArK0i-;amatWC`I{`_J^&C*$mQGm^*FvFYRZ?CUh#0nssCZ_=zxIM&0?n$UD~ zLluFNL=b`G2#I$gyh~Gv9H)_rE6^2I*Wdi5o5zWK>-AGBK}nITxGCqxa8pRdv&9O0 zI^9L%SEqTQ(?DXf*VQ}BQFw7%?EC}d*Cc|WMc41|%o02&hFs{Ygb#s-&*Jn$^jN;V zX1wCU!0DO((Gp`a{0Rpvl8|{(<*tuReRwYpPci77d58&o91Y>ax~Q-WktnjK2~rEc zO`&bRMZ@~G1GxX{Di&Vy$bX~ZBE|Q#yCr8Pi|6)*Qg4Epb6s~^u?Vy zYq8pezVS}VW_;3uuNvZ-KI-3GnEir$gYQ~~aUE#&D;XqHc*KsBD*BM$`=rB#e0(~8 z;)U)|ZQE*rL+SJ_QbAF9QM(kg>>h{0^=I(*TN5_hT@l(b;cD344tCx=Y@rj;_rYNu z%{WTM^B0_=Vy?S&BOF>oReqxM6%rHvlJsSc!Joo>jvusk)XD@ZpTD!#?YwvQY2x;9 zQ%vSK)qF(EIrP|$l2)2NhiRXM%6)wQH@^}>u>kUo`0 zUn^$C#7wZ}tyg-0w6g6GUyUPcZP)I|0EZ6?%eAJQf2}UHfK9PfX-0@ss_tDnnJifd zsJpGilV#uGvb|zT_`dv9N%;RwZOT{_hgW<~I~;Y}{louPaMI|l^nakydyV1bbOr@o zaYf7n#8n{3nvIlT`>QVNpE}YfKT-jy0c&|4CW; zkb9L}h`fL&-!vL7xhZ-D4sRy1EG+~3SnM5_v% z|679kTZ4R#H)k|gwfRuwJ<>)=(*`?jTaW8MPMHC19pWo+aMX>KFy`YLsQ(ByRYM|~ zo?TrD8&vb6$#uD*6nrOvIa&71>$r6R)k7e(($0<>VQcnh`+er3oma}XEk6GtBUFKt z>?`9USL{(s_q`5f%Ot2A5M=7}AERW1ws$WjcJ;Jcx(f_skBYGQx=a367@mYLC(C}8 zGiRa{gT}~Q`F$twagZR(nE7%|lOCp*wq`gfD&bwS{&7p^*YMq{3pZkGxSIE=k4p3r zmvi?jd&vLx1oFV%r^eppdo2;S*SI^^6ojTJX3PiFEW5_H*zU`?y!;oxE03E2h1GH| zyougOyfv+%Sq{pi(P3SevDfg}FKvha@l2yKMDbTrzx?^(HsezeCyS@ncu(R#BbkSX zqTeRefOd%03!!fxV|b}&(N>_!RPRB84DG9aAU}R8k{@>mC>_cU(mz@IJAhI&7v|=H z78vuL!$+6mM$R^fD%n1Rq~AJK+m%nO$)5oyI+e`};)QYsDqGAvTeE(D)5#FiCbPgx zrbO!jTFE}Cbj;dj4Bv!NTt9^uViEkXnVu(o$)21$=BLNEZH3>G*fD8*d_^u$zW??30zI2SOS?LMn~xyGdf=J{6&R`iJ>#YdO4#Ab_SP|n@2%;T6>dB zNdHkF#o{~#Y=DAnX)PnOGU@q9(Jkhnzl4SOZ$Se?t@prsw6u3hjULp_-yi#C*cs?) 
zntB(qC`P<8DGb-G%G$H~<3^Lz4qGhrmpkIxplE}o?zI+7Pf1rrlpxFRz21SmEJEv-BcW>+I2TsLbJ|P7PaD5V5t1t{p&o zg_v(v_RbcaNVT`~=s(OuTE(}nRjM;3;LLZPFFV8-q71&h^U@)TbKi31_~$k8w=Eb1 zThyVkT!d2{rDCN=yzfxNF&vGw;#S=z!2Od_ZpXqP^+3a>iT}6*vSh&I;`FP*{7?54 z7d>l;uf>^4Vzi6DT79Up8h>MGkN z6a2N=Qjs}M#`B;l*28yx@^~L`#mXtDUY6TOWQPAsp}+7yxS(RM`uq<9SQ4;;nn=Lc zf1-hvxcB4VM}V!*|G;&4CynfR4aD6Sdb+1lOw;rE4yPIU7KXOd59ea^>SEkm( zZ85YXCp%`zs|)pL4=aHw^}KxB&E8n5Vr^;9b7E@*wQ5?Q*OS9pb#mcgy1Q)P@~VQ! zf6N80k{~r&)We-$)A398SE5!*CIxik%7R>slz3L?5$}w}CRy$jU5J(>TiyCV@Sb*c zX8-$Ppm^9=igFXOy-B-_q6ekt3(yZyLbP;zU={Q`u+xUsw`lMQ90;pT36~qb*EIU! zvNO6SwbK5ff}}+UxV(V|%D1t6+>w4Q&|t;VEknVzgKr z<`a?ovza}<71g7ZM@IH7{2mRpRzwdq{S*JA2U?&Rv*lJt7qbUwzqDGf;uyb7vQMK& zUHp7U(Rt}q-^LeJ8>iC2CY}A-xUXT|o{L1>J(E#|h2Va0J_zEPeV0Zref*2dvF5c% zR~Q;ke7KEcOv@uohtlDDJJ%Rcy4%%=NrN$V7YKtsmGfu3DX= zM;7gx#)s+&sIZR|$lIVU{~<(lfKxn6VhzHGqTi|V2#_1l$Rx;;Qj8m-DR7s(fxaAR ztcH|%|LQA>z7vv|5Em-H*jHcG_DVu{tA3AGs){p5XJazma<*Q9w41Y1XcDfqhUOnUps_ z1FH?*eR3(QB)fH?p)buk%`A8;e=;{5Wtb6KPPW~rYznV97Zm$?$H((}#{>7CEe@f# zs}r#W5Z}!_xBYyPZYp<9J*exR6}pFvzCtT(^PTXOBcOjj_T#4ENP}3lEi$C z7Bp!UEq3(~Y!(Zxp`+0=r%CV4dXq9^Kl#O$Cm}XqI{nv{Ao@i?dq>SfR$1ZHGD{t8 zS3NTICl)!c3KO~_osWJ=RJORGe< z*98d>S>U5NZw`5?J!~<}Ty@1oV5#9@XUrD}`AyIuJ!k~USymB|r+RgHAWy2=5-j-}bVW4_w-d0yf3M4lw z-Gd973jef?c#A8)rMvxDtg-@AWAZtR>8W8DX0+VVX6+P}_Uv+kLVC$S>F2Cu&7`RI znF1psHbgm7vR6*I8O^tO_{7f+6ry11_|l%4)8~P2_L5=&@fu7n45|alYrCU#jAoDC zm*B39J$yf2S#Y{VZf5U>4pM1ts|OO9b-g5SO0zb3Iytl*YwCUr^J=~+BX>>{r4@MV z?4(F@sy2cEuqKqi$B}l(vqFu`o7Q|D{vhMWI({YK5BYna&yaD%vp0XC(R&gmy#wP! 
zZpaA2sNk6tA5hd95pTMi#U6Eh1>wO6cd^cSHndH`eM3zqGJD4oX3`F(j@9Kinxy%Q zmx6?@z=KM4Si9D{dKRhWv(6q`0P)uB(BM7uINTg+ct zP2c|&mCKt`Zb@T)4hna3dY1ndV^b^dsU&X@BLbmmEf@QhEEzv~n(UB2i0-DD(!6g% zh}^_bz^zD!og2rqPZYD*!(_rq(tXdg0SrSf^FtDV)7D z@UYM$>WBAz&YDYSh&p^|AmNXr;V*9aE5REGUM4uP?%%!Mky+u^;D9^K;C7$B5O2n)qD5^C^25 z%R@N&BZIL3+i?!FR6}ycEyUAasvB~FKm~mu=(wEacv+u@|8{rC4Bv-aX5ZH^?vk#X z+WlV=t+pS41D`0T9LjEWj(k{YV?lmDYsq53{0I*F0y;#KfR@-A`?!%$r9R7i0QWIf z%VL|jiHDgvlCOd#Vw)YqmN-!1tyeO44LL<~Uk_eCMpc!!rn8p6R(%zm7xFVhyhX^h zpUC1|s%LmwqcQ)OxQSgCS^Ce}~n$KYazze7iO-Up#+ z_eodYpLeI<*GB&{5(X|cNlB|RZ^8GPHkRA|-jofV`UsyS#1qsC;!fg+|0kETzp>N` zIf|dK|5btpVuZXS`>%Rp9V3B*&p_++fI{ghV9RIv zE-$mXrgOU!8&lNT_b`cu*iP&N!Swh#|LM;$*zL}w?*+egvUuU~a~qykFJKP*@i0(~ z?;)aQ53n&r#4>?$E-5Kukvi#xs{o*?>&( zPS|7UZIKH#vEAhf0&cIzAq++PJtw6vFVPjB@Hzc0qZyjhdMrJ>r?jLP+BZAmF=nOslrJa zcQV&{dt>-@Ky%>*iv|@qJ|-WvcKI&sMrG$b0imgm86%7fWjurl%AVWzUp5uIgD%qO zX(7_1-zZ`?9yGWABn1Iddqf@xik7}cU$HH7&}EL`8wYr%T)$T8CH`kt%W9KDJSC)xPAAStbBd8qAST^YU~b@xNE z0ot~rP=8NJ-(ARrLWaW+o=UP!UttXN@eHp&Ut6qmx7(!NO%OSqS|vkv;aJk9=4|efUO98F*i}_x3B}(+U^T8l>Rz zTgh;l&mIW`aKzQu$_l*Chv(j=`z+9f<5=EHZY9>ZO4Cj9bvorNv$&Hwts1i#&7paOB}FEGF!2k};0 zzgv|M|B{`*c*aNU;@WG_?R}*xZj;6HkCpmI8|*LF%N(JEaFGFWq2%!0XY+5tSn2{eg$%e29c&ws9!*?>DV!@WzzUQ7D=)ec#42hzgsInx+7K($~ zO({L9F?bBrxN$zcB+UwEUtKDDY?*82lMju#<-cY`Dj$eD@_Q9@zR)2t*)j6oq`C=D z)!HY{DL0oPM2|fsnlE^bEcR1TV5;H)e6OTST1o$qT;Rhypo-M|do($Cm3fMtIbb3G zA7mx{CnWiV=!u}@|7EhK^?N|p1e^9DLvwJ>a`M*m9l+V#Tie<->Ll}tC3GLWkIey~ zOIM-VNUBzj%9|5K_9(Af>e+6d#ypsd~6|bBmV=qcv zSqaqOTSCu{is9Bq8?dzWdY!sNu9Px*m88Z)u((Wdz!v`%Bl9G#Q3vqFm7mV7UN?K; zFZ=^B{xwjD-+KR?z%u7#lh}Ye+1Q;G!j8iJP^m$5`+#&!sc>|=`24i=ihX`?+lShk8Y+usgB-6eS>^B;v~ zM`1f6LBadV4chI|y{P9+uddkTY12K98eTJcsxZh1?CaPbeU4OP?U-bE79H_A0$P9? zULn+7T$$Y@FiV`_CCk>>SNgf0VZu>dTff%+MEu)QB+k65Mw{imJ$IYP4<#qp3RU|x zsFa_B>V^7?=?pHwpk8$CrwNGDMJ9z*O@+Q_C=3E8D6Y?izr@!(2;S3uH4E!23~oTTo&oz?l0{Cp_}|nZx5`C~LG|w@Ul1N2Tf0@(^q? 
zj(c!;i@s4v?14|l_E^Lmd|;sz^}PihClCZ3Z$qt~@ww~=cu+c?QJh)c?+odLe+Uo{ zs9z9Wvy#-pp-&}*0J2m76d1eKJ2OMny^wVEM_SlHuIJnXqJ)rrf)(4!eu4$Uaht6! z#;{^~dj!($HUXgO^m~x?mG{n$ksdk^thE!+Ujpke%m_h0lz$prUC6=JCDZW!!SP~3 zyW@B$oF~Y{EVSGFS-cGvYq{P$Fk;mk zxwUa?O@gTv9o=)O+3~0R9(RbnK*cmW$D*!kSq)0%g&MRZYo0j``jSmI1=R?N6xO1)I35lqYv^j^gY5C$E>Jjw-0-_)WqzMQ?V?aU=3Q8|ZCqOs?0qG@xgeoGS^iF6ZT?j>b3n0=7M5&=k4TL7W zNKxL#?|$ET-@W(UH^v*|<&Qlw##(Fdx!0a+{^tCxwdPtF=WLKbi6S8Oe~oT$-R$>@ zKIxV?LKX`^N;GPkdAqXs#7rDkJiqgnSyY1%dn32d-+ig>skw1GF2s{(A4I%B4^+h7 zWxe- zi4!_CksadED3g+M?F(`gP4D2}`Y4lKWO9Vm@ic(N4`R^++b~NkjATQ=Nn5jeN$rkJ z@hur)Zkr}=q2(&DuozjJCH3rK>coV&pEJsi?L0Bn*)Fan|cz)R-{2e6unIngA9 z+xHf0_a17NPeq%w{QW)Pu3n$a2pUtzD#%zT|F=Ci!TmP=GVhYOX7%e4Ngix`#L4Lq zOmk}&@yZL|&TPj&7fC`}#8(kX!!6C4xykzBHKreJa~`!gDx%AW+MOmpKMm{XnNAI7 z@$=f4g&QK(4~%<5164*Xh(j*oAl1$NLUr-v>mt{>f}$t?Dg$WWmx14+D#vUPznWDNT>WRZcyQ>ycf&nejzAvEW*j`4JN$F@( zQTWpAZrdy4F+t`xsz%n(i~GR(zGmV|)W}h%*=*y+Q2ROkHb*O$^5s$gQhZ=ml<)o~ zzD(F^u1z*X#&To!h8M>j9kf|{C;BZlrM z0LkZ<>MA|A5Q!L&o~O2Y#X*JS3w!0GgmDL_GoK=xMN)69e^E=m$09;?7CLYN5@gs3 z#s*@S0$5svpq5Xxk;u~K^Hjn2;L(1*4fuQEEowbfLJT548)n!*{42vZNpKHn&lPiO z(fRpnAY&@0i$d;yF`H3aa#BduK1Z=K!C@D1yIuvsqT@M3QW zb_u)_9Y}Zg^;hWUxW0cY`MY{jqa26f$h6!=b{;iWpv~VU|LI0;Q`3j*sHG>ksdP+$ z{?q5%928`x-{+$@>+_?vm$_VRlrQIzn$ou>G!C`F`VuGn_&uUgoRbH2;#O|`n z6hDuqv^g|hw=QAZGmAyT9F5@N#4J^FJ^`8AOQprVEgqZuQQ@rknz%%LMAokl+g9#H zadDLy#P9|2y6`eAblWyHku*0WF=nTvyK-J8L^Bv(*5l~ZX8OeUSsInCORaLBpvcx* zN?Pp0OXoJF*z#aG0oAXKBdb^V*TNaVd@8%q7lSl zSR3kMnSR;FqSh`zRnq81uB*eCL79|)kuZ3@aa3Zd^DEMZry+6Gz(f}9`1-4xZc1mU zF>Ogsae~!wrp-;UF$UA9E?JGq%&+YYz1I(7HkC^TYV7BQ+tW(R(GR$Wf*fRfuk|oR z^0SL@vl5d1j&(-iDmL?~5SCXpSYy0EaSp$+c(u}(z5x&Ea6I^ti-a5iy9o+BMClTKkHpkw%6Rjm`%9gI7 z;yc#!I9`6tcCPYJvVEl@IU-4ScA!rTWhW2voQ#Lr{#N&Hv!Dt0_}%EfC|S=*{6g0T zqRUo3KXGocs-V4RxpWv5OB!xBXnY$lb8g72yY^QhoPEu{b^~Nwr}onzy zOB}VR?(lrTxBgpiD66%f4ZsDyTX|@F)?ko_5~-(8z`~sGu*u`a*Y$f24d=_nf*%!a zKeXLar?(tGcf)xL&@64TWJnn%UWbs#m0nC%w9`&YteQwQ*4_1V)$eUr6C9sa7^c|R 
zg-+Q003(mFpF@xOed1s=k18FT4g3l*=pI64(Ig_DvR=fdSe+P=q(1vrcz8ClN+^Fe zg$F5JMlBwz8Ql2M*g620aIr{Oanh$8opCM?R>SvO+oiP=y6UVuaR_>uEc!=A4b$gk zL=-7eQqlZ$9-eNOm)gQ?yh1@zP4Bf;I>Lmyi5qSQ<@pKv$i_Fn+VesKojV*Af3khk zD~IyLTwt#y3d$VQww*^Fe~c;BDA_bKt&3E!`03o4qI(&$T-n#{?CLd&Wkn1WyHNC* zI3_(_%&l6$*$-P>0_lo5zcIQy4+xfTjS-2X#PW{*nPrcKQynS5cG@=<3%Y187fG1L zC9!GRRC0~ciYrN+=mQH)^uI2V0+^FhW2*r0FRhI7%vA^puqCGgYgBhsq5o+>%FuM% zOaX=SHbp&$k8$BWG=h^sYY@OxvvEX_oCla**hkp3zm7q&b&9rS{#BI7=j4j)5^p29 z%e;iy`&f_{`p(1c$NMl+%aldqn9WV%c&WR&(yfE*W1Tl!B~KN2WDKy^#WpKhA$2Pw zDovM5;&0gB(na6+_E^%=({`*+0^w!5P@7-K$YZs$Mu4le?@55Rvez0V(~xQ~a| z%}|q$NNA69TYSti!6M*vh^eOGXeOmdj&rC>BSg?zQ&!M2%m7emA)USk#Ld_)ihKV zfB&g|ulsvCagNDb*E)n#A+nuH;-TW#t_vGF220+x&@ePrO|x%|k{tT1Ec;JcE(6%+(5CYq&gq$v!Up{Emyi=31l+itlX7q^$iSAne*TPogl!td zYS;RKIEz?WnJsEMJHpO4x`t|>TZAo*z}V&I*utGYZDMBBOcVEzLR)bruT?{fz_0vF zXbOye)Y{rFsV~$~DNgg@k`dnz6u0KKqBI@-gVK*P88w{ir@h0sRWL>^Ehq+H_z$C( zEc*h7N<0$HlB~2I&K!^uUn&{ zth?(UwrkR|$#%k(os<>oJwo&=r>IlomuE{P75|8%K(H8|xQ5p7GPiOe-TUZ3HF7I) z?wNTIrs>wosRYzp1>@AxLVyIiK`gR&f#;4ZN??$s8q{YXYjg8M_9vbARoWo2QYE@K zt{;vzVq$cV#ct%L(ySai89h?>10ku1GjJKI8c@#6o43UW>uO_3U3w$i^)ToC@t51P z>RQpuY>W))j_Si1y>~Ro21D_eydxEBHP%WymSBmWth>w=idg^R`7O+EWeJ}Dhq3@4 z1|KQChyAk68%+tnutHvc%q-$j8k(qI{cXCx^+IH&JdPexFiURJcf2t(H@kX7{ z+PNj{Ah9J>Lr#}s^6N5*aHvS$t!#Ur??v@Vy^A&v`$RQ1&e()@P?6T%i1zl9t*8B` z+UQN!{9Z=Jx4gOuLv$#4ri00LxK-!uhBuQ0*#fF~1hqAg)LfZ1F!BKZPFYki^ffSU3UyJTPu8ewj6P z23;uz*^;;UD(@P_?Nh zHMRINMR|!|D$i1Ri>b669D8C^4o-IB^G8a@WxLkU(I9+UbfPc>#QnJ?6 zd`)WgU`1k0JXcbcJCPky(M>fi6qjpEAgL1W=f2xxLv_<=ky4IlCM}?I5h*M*;iV-= zrYEswYO@53JF!xwn@U${KsQ5Z>^Qn+@B7vr6{FkwH@k$sT5@C{8cy=cy0=;6|)X=n`o37=zvOEt~RFK$n!-Ru|cML0sVV*PNm8js=1^K(~! z!a*JJ{-us>32eETK#BRyPIF6EVDDSUUSAOT0m^=u5a@{kJLU+@&vLgfry5@op;_`* z{Do_0f&AJV0s2n008`mVOpg;Dwn8INH;hVL9BaVeu}{rMwnlCYe=%})2acybT=ew| z0K2{UKEXXh7LTt6I!(~G$S){XV)f@Or(!I7g9fE2&Y?ygBXx<+7ti#--FkLJ? 
zhqy9;~ih7X!t8Bn9MD< z`I|~Sg$6^;m`B$0kqqdP%7}Pr)=>+m<=r*97d!-9_~i9J&ou(aZyp>x0$FWYmqcpR zHR~GDR-X17If%)n+9o+M!J3SRT6>dcI?ZyomHRbpSLH`7Ku71Qg|_f@PJAKQXS&E~ zNYuG&>2Yj}vbUCupVCskp9L9HEMS&@V7^AL6jlXu0Ima%d4oSoy@ij(b4n!YORQ24 z;S8i`Fr{8>;tFeU*s?*SXJO_B8d5CXB%7$o;&W9Ta0dGgjkJ>5v~aXk!~oIM)rSgO zljL1qloWwqqYE@^(O?pn;*F%X;tov`7yHnqbv&7-h%lFBmURhl^u@@06n|NtWzGk( z9ob;^i^iJ6@&rQ%`@(U0lg##=4T84vHDEM!uJlJ4Q4#;cD}ZG`e?iy3H@#gFwm^R1 z7deb}VOdjT7DISOVt zxart8EcBbGSn`1ZVf;ZQo%l6j_+9tHZ`ZHjgDvnsq{=&Jk;Ylc`sX!%=?s_F=BO5#;DOapO*7m> z0Y3cS81%Pae(2%TO3gyWb~cw5@Xh~)LI3XZ`!+N*_%t*&w$t6sfB7$Z?)BHC-ObHt zfg@^C$%zjaz<1orol8^vz-LB%O2>u5J{B85nsl$X@v1U`Wh!a1 zOe6&#-=BvaIF=pF=R8&0h08pzZ>q!)4#l^F6dXu+UX`Pu-K*OC^&0sHaD_3&VCBPa zQtQME>nq~tyHcUXxo*!uC!Q23=irFwFnM>4cK?Z9ot?gdVM&}WpUuIdli}AW&2i5k zJ2NqjyTp2Df0isx*5Fk(q;cn7UwV|PK$VfN)GP)j<`?W~G_C^76d%LDwhfA|Z{(VS zIf~!Cn(VF3PJC~LcHmGvsr$ld!mm7L9UsV2+Tg0O+$|Bk4==t`o0n7CJoKkolAX@0 zw!;}ejzC}J;MXYKH78F436+_jT86K)G^C9FTuM-q-)*8?uchqKDSpZ7T->Ea#- zikBQ7JWAT#S5E`pZSzJ1EB<;>84xQa_WNA*QJ>cmm3`&;jq-l@8Ne(h@EsGr?*!Z0VEnLK|4P=?^=CD90fZE3 zt*zbSU9w4RS}d?lZD`#I|BKt1d;E&0%w(}Cld#QJ1Lx~vp01`H#Fj+LRbIuk2|JAH z_ZvkX=}#0PZ$e#;;5&~ZOeb_DEguQ*3B&z9V^QYj?Sd%~1PZ}Ym0_;K?B%>>Ncm9l z;8AsKIO`XA-@2gf0__Cy;Uc11cX;PTPE*>2uTwM-g>rzjDf7t|P3F+!dB;Bff&7@@ zI4`41>RIIb6UyXkr!o~6v3yQ_(Kbsz?d@`_%9NLY+TDRaTc_J7jHDa>HOOvh!%bpjWb_^S0Bl*z?vH5ao$f01!^El#Lcei~G+K5Pt4xFTS zQBQ8i%E*Ysou(J?6#7?WrVcy1xb^DOH}V@$Ks;}k)Z$xL|5g7^6B-(!&nyyiL^L>Y z4PO{~ZWtG`FQo5@QpRb%K(`IH&znXgMD!)&_E6t|b`A~& z)KN$Yn&S1h`*`-iZ1QOpZdiQNz=-_&JBWSC_j%f2oF_km`sWq4bo9te`e@J`*nK%ri*5jhzF+SJZzBMZ~X zJm775UeS~$AFR^eBW2pd=n0H5oZ3paz1<003sH0N$r6?^eI}hh{ss}(T$Yf&kB3*K zxa4HmG<>}Y_rY(N-3Q}qN)Sk1`%>JsL+93Qn_4dWz2VQBi1kT5K90W4Kj*Ia*7Spn z%aX2ke7$Xd&oIYPf2bQl8tRV8Hp$Oh=Snf6OikPH%B>?bCq+jkq?AX^V^<}ULfe?- z7WuA3DwUYSacuGU9q-KfJKQn{swQI9Z5%fC*Gr>`L!Z%#yq_<=Vp zqDZ(1T){#RP?-r#{Lk6QBEwDoDPqmm()Nm8#`iD?G!0h zX`QjH9D}06raMQsnJ@7Gqy5&;wY2?(i!ype&V#7OQY;nGx9m|A-Jd(}1A*c5!dvEJ z)Jgu>VP|JAP9;gE_yB_|tampaZ|atQ`Aen0dJ-dMd3U2PGmU0UDW+;;_DJ(* 
zvtNP4_zx6ikXv-*1wed%#}V~eK~_}+8W8mcyR?Gvd98WE7OnpqqeXmPPjNr}uP9C9 zGT)rCq yhx@@o^{J8};0)IP|J}p+zmS^RdwnF*$(aZJv9lZL_bF7sPx-ONqhh(gUi~k dict: - data = {} - input_dim = self._event.args.get("Input Dims") - if input_dim: - data["Input Dims"] = input_dim - input_type = self._event.args.get("Input type") - if input_type: - data["Input type"] = input_type - return data - - @property - def data(self): - return {"Input Data": self.input_data, - "Host Self Duration(us)": round(self.host_self_dur, 2), - "Host Total Duration(us)": round(self.host_total_dur, 2), - "Device Self Duration(us)": round(self.device_self_dur, 2), - "Device Total Duration(us)": round(self.device_total_dur, 2)} - - @property - def info(self): - return {"id": self.node_id, - "node_type": self.MODULE_TYPE, - "data": self.data, - "upnode": self.parent_node.node_id if self.parent_node else "None", - "subnodes": [node.node_id for node in iter(self.child_nodes)]} - - @property - def is_root_node(self): - return self.node_id == Constant.NPU_ROOT_ID - - def update_child_nodes(self, node): - self._child_nodes.append(node) - - def update_kernel_total_list(self, kernel_list: list): - self._kernel_total_list.extend(kernel_list) diff --git a/profiler/module_visualization/graph_build/fwd_module_node.py b/profiler/module_visualization/graph_build/fwd_module_node.py deleted file mode 100644 index 34d7ab82964..00000000000 --- a/profiler/module_visualization/graph_build/fwd_module_node.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2024 Huawei Technologies Co., Ltd -# All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from profiler.prof_common.base_node import BaseNode -from profiler.prof_common.trace_event_bean import TraceEventBean - - -class FwdModuleNode(BaseNode): - def __init__(self, event: TraceEventBean, parent_node=None): - super().__init__(event, parent_node) - self._bwd_op_list = [] - - @property - def bwd_op_list(self): - return self._bwd_op_list - - def update_bwd_op(self, bwd_op_list: list): - self._bwd_op_list.extend(bwd_op_list) diff --git a/profiler/module_visualization/graph_build/prof_graph_builder.py b/profiler/module_visualization/graph_build/prof_graph_builder.py deleted file mode 100644 index 83331b62502..00000000000 --- a/profiler/module_visualization/graph_build/prof_graph_builder.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2024 Huawei Technologies Co., Ltd -# All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from profiler.module_visualization.graph.prof_node import ProfNode -from profiler.module_visualization.graph_build.fwd_module_node import FwdModuleNode -from profiler.prof_common.tree_builder import TreeBuilder -from profiler.prof_common.trace_event_bean import TraceEventBean -from profiler.prof_common.constant import Constant -from profiler.module_visualization.prof_parse.prof_data_pre_process import ProfDataPreProcess - - -class ProfGraphBuilder: - def __init__(self, prof_data_path: str): - self._prof_data_path = prof_data_path - self._prof_data = {} - - @classmethod - def _create_event_bean_from_ops(cls, op_list: list, name: str) -> TraceEventBean: - min_start = min((op.start_time for op in iter(op_list))) - max_end = max((op.end_time for op in iter(op_list))) - # 以反向算子的区间作为反向module的区间范围,为了module包含算子,做了+1 +2处理 - return TraceEventBean({"ts": min_start - 1, "dur": float(max_end - min_start) + 2, "name": name}) - - @classmethod - def _trans_flow_to_dict(cls, flow_events: dict, end_events: list) -> dict: - end_event_dict = {} - for event in end_events: - end_event_dict[event.start_time] = event - result_data = {} - for flow in flow_events.values(): - start_point = flow.get("start") - end_point = flow.get("end") - if not start_point or not end_point: - continue - end_event = end_event_dict.get(end_point.start_time) - if end_event: - result_data.setdefault(start_point.start_time, []).append(end_event) - return result_data - - def build_graph(self): - self._prof_data = ProfDataPreProcess(self._prof_data_path).run() - all_data = [*self._prof_data.get(Constant.MODULE_EVENT, []), - *self.find_bwd_module(), - *self._prof_data.get(Constant.CPU_OP_EVENT, [])] - all_data.sort(key=lambda x: x.start_time) - name_dict = {} - for event in all_data: - order_id = name_dict.get(event.name, 0) - event.set_id(f"{event.name}_{order_id}") - name_dict[event.name] = order_id + 1 - root_node = TreeBuilder.build_tree(all_data, ProfNode, TraceEventBean({}, Constant.NPU_ROOT_ID)) - 
kernel_flow_dict = self._trans_flow_to_dict(self._prof_data.get(Constant.TORCH_TO_NPU_FLOW, {}), - self._prof_data.get(Constant.KERNEL_EVENT, [])) - for start_time, kernels in kernel_flow_dict.items(): - matched_node = root_node.binary_search(start_time) - while matched_node != Constant.INVALID_RETURN: - matched_node.update_kernel_total_list(kernels) - matched_node = matched_node.binary_search(start_time) - all_data = root_node.find_all_child_nodes() - all_data.append(root_node) - return all_data - - def find_bwd_module(self) -> list: - bwd_module_list = [] - fwdbwd_flow = self._prof_data.get(Constant.FWD_BWD_FLOW, {}) - module_list = self._prof_data.get(Constant.MODULE_EVENT, []) - cpu_op_list = self._prof_data.get(Constant.CPU_OP_EVENT, []) - if not fwdbwd_flow or not module_list or not cpu_op_list: - return bwd_module_list - fwd_tid = module_list[0].tid - bwd_tid = fwd_tid - for end_point in (flow.get("end") for flow in fwdbwd_flow.values()): - if end_point: - bwd_tid = end_point.tid - break - if fwd_tid == bwd_tid: - return bwd_module_list - # 将每一个反向包成一个module,名字叫“nn.Module: BACKWARD_0” - cpu_op_list.sort(key=lambda x: x.start_time) - pre_status = Constant.FWD_OR_OPT - bwd_op_list = [] - for op in cpu_op_list: - if op.tid == bwd_tid: - bwd_op_list.append(op) - pre_status = Constant.BACKWARD - elif pre_status == Constant.BACKWARD: - bwd_module_list.append(self._create_event_bean_from_ops(bwd_op_list, "nn.Module: BACKWARD")) - bwd_op_list.clear() - pre_status = Constant.FWD_OR_OPT - - # 通过连线匹配正向module,构建出反向的整体module关系 - root_node = TreeBuilder.build_tree(module_list, FwdModuleNode, TraceEventBean({})) - fwdbwd_flow_dict = self._trans_flow_to_dict(fwdbwd_flow, cpu_op_list) - for start_time, end_events in fwdbwd_flow_dict.items(): - matched_node = root_node.binary_search(start_time) - while matched_node != Constant.INVALID_RETURN: - matched_node.update_bwd_op(end_events) - matched_node = matched_node.binary_search(start_time) - all_nodes = 
root_node.find_all_child_nodes() - for module_node in all_nodes: - if module_node.bwd_op_list: - bwd_module_list.append( - self._create_event_bean_from_ops(module_node.bwd_op_list, f"{module_node.name} [BACKWARD]")) - return bwd_module_list diff --git a/profiler/module_visualization/prof_graph_export.py b/profiler/module_visualization/prof_graph_export.py deleted file mode 100644 index d336e97f741..00000000000 --- a/profiler/module_visualization/prof_graph_export.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2024 Huawei Technologies Co., Ltd -# All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import logging -from datetime import datetime - -from profiler.prof_common.constant import Constant -from profiler.prof_common.file_reader import FileReader -from profiler.prof_common.path_manager import PathManager -from profiler.module_visualization.graph_build.prof_graph_builder import ProfGraphBuilder - - -class ProfGraphExport: - @staticmethod - def export_to_json(prof_data_path: str, output_path: str): - logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") - try: - PathManager.input_path_common_check(prof_data_path) - PathManager.check_input_directory_path(output_path) - PathManager.make_dir_safety(output_path) - all_nodes = ProfGraphBuilder(prof_data_path).build_graph() - result_data = {"root": Constant.NPU_ROOT_ID, "node": {}} - for node in all_nodes: - result_data["node"][node.node_id] = node.info - file_name = "prof_graph_json_{}.vis".format(datetime.utcnow().strftime("%Y%m%d%H%M%S%f")[:-3]) - FileReader.write_json_file(output_path, result_data, file_name) - except RuntimeError as err: - logging.error(err) diff --git a/profiler/module_visualization/prof_parse/prof_data_pre_process.py b/profiler/module_visualization/prof_parse/prof_data_pre_process.py deleted file mode 100644 index 9dc820e4ca5..00000000000 --- a/profiler/module_visualization/prof_parse/prof_data_pre_process.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2024 Huawei Technologies Co., Ltd -# All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -from profiler.prof_common.file_reader import FileReader -from profiler.prof_common.constant import Constant -from profiler.prof_common.trace_event_bean import TraceEventBean - - -class ProfDataPreProcess: - def __init__(self, prof_data_path: str): - self._prof_data_path = prof_data_path - self._trace_path = "" - self._kernel_pid = None - self._result_data = {Constant.CPU_OP_EVENT: [], Constant.MODULE_EVENT: [], Constant.KERNEL_EVENT: [], - Constant.TORCH_TO_NPU_FLOW: {}, Constant.FWD_BWD_FLOW: {}} - - def run(self) -> dict: - self._check_trace_path() - self._parse_trace_events() - self._check_result_data() - return self._result_data - - def _check_trace_path(self): - if os.path.isfile(self._prof_data_path): - (split_file_path, split_file_name) = os.path.split(self._prof_data_path) - (shot_name, extension) = os.path.splitext(split_file_name) - if extension != ".json": - msg = f"Invalid profiling path suffix: {self._prof_data_path}. " \ - f"You should input in a json file path, such as trace_view.json." - raise RuntimeError(msg) - self._trace_path = self._prof_data_path - return - ascend_output = os.path.join(self._prof_data_path, "ASCEND_PROFILER_OUTPUT") - profiler_output = ascend_output if os.path.isdir(ascend_output) else self._prof_data_path - json_path = os.path.join(profiler_output, "trace_view.json") - if not os.path.isfile(json_path): - msg = f"Invalid profiling path: {self._prof_data_path}. The data path should be the " \ - f"folder that ends with the ascend_pt collected by the Ascend PyTorch Profiler." 
- raise RuntimeError(msg) - self._trace_path = json_path - - def _parse_trace_events(self): - trace_data = FileReader.read_json_file(self._trace_path) - self._check_trace_data(trace_data) - iter_trace_data = iter(trace_data) - for event in iter_trace_data: - bean = TraceEventBean(event) - if bean.is_optimizer(): - self._result_data[Constant.MODULE_EVENT].append(bean) - elif bean.is_cpu_op(): - if not bean.is_step(): - self._result_data[Constant.CPU_OP_EVENT].append(bean) - elif bean.is_nn_module(): - self._result_data[Constant.MODULE_EVENT].append(bean) - elif bean.is_torch_to_npu(): - if bean.is_flow_start(): - self._result_data[Constant.TORCH_TO_NPU_FLOW].setdefault(bean.id, {})["start"] = bean - else: - self._result_data[Constant.TORCH_TO_NPU_FLOW].setdefault(bean.id, {})["end"] = bean - elif bean.is_fwd_bwd_flow(): - if bean.is_flow_start(): - self._result_data[Constant.FWD_BWD_FLOW].setdefault(bean.id, {})["start"] = bean - else: - self._result_data[Constant.FWD_BWD_FLOW].setdefault(bean.id, {})["end"] = bean - elif bean.is_kernel_event(self._kernel_pid): - self._result_data[Constant.KERNEL_EVENT].append(bean) - - def _check_trace_data(self, trace_data): - if not isinstance(trace_data, list): - msg = f"Invalid profiling data path, this feature only supports performance data " \ - f"collected by Ascend PyTorch Profiler." - raise RuntimeError(msg) - iter_trace_data = iter(trace_data) - for event in iter_trace_data: - bean = TraceEventBean(event) - if bean.is_npu_process(): - self._kernel_pid = bean.pid - break - if self._kernel_pid is None: - msg = f"There is no operator on the NPU side for this data, please check whether the NPU switch is enabled." - raise RuntimeError(msg) - - def _check_result_data(self): - if not self._result_data.get(Constant.CPU_OP_EVENT): - msg = f"This data does not have any aten operator, please make sure to enable the CPU switch." 
- raise RuntimeError(msg) - if not self._result_data.get(Constant.MODULE_EVENT): - msg = f"This data does not collect any modules, please make sure to turn on the with_stack switch." - raise RuntimeError(msg) diff --git a/profiler/test/run_ut.py b/profiler/test/run_ut.py index ee27abaace1..6ab208dc29e 100644 --- a/profiler/test/run_ut.py +++ b/profiler/test/run_ut.py @@ -13,6 +13,7 @@ def set_python_path(): os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "advisor") advisor_backend_root = os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "advisor", "advisor_backend") + profiler_parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Update PYTHONPATH python_path = os.environ.get("PYTHONPATH", "") if not python_path: @@ -22,6 +23,7 @@ def set_python_path(): python_path += f":{compare_tools_root}" python_path += f":{advisor_root}" python_path += f":{advisor_backend_root}" + python_path += f":{profiler_parent_dir}" os.environ["PYTHONPATH"] = python_path diff --git a/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_dataloader_checker.py b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_dataloader_checker.py new file mode 100644 index 00000000000..3d8e22b7c66 --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_dataloader_checker.py @@ -0,0 +1,65 @@ +import unittest +import os +import sys +import yaml + +from profiler.advisor.analyzer.dataloader.dataloader_checker import DataloaderChecker +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestDataloaderChecker(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + recover_env() + + def setUp(self) -> None: + rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))))), + "advisor", 
"rules", "dataloader.yaml") + + with open(rule_path, "rb") as file: + self.rule = yaml.safe_load(file) + + def test_no_dataloader(self): + dataloader_duration = (self.rule.get("dataloader_duration_threshold") - 1) * 1000 + dataset = self._get_mock_dataset(dataloader_duration, is_empty_dataset=True) + + checker = DataloaderChecker() + checker.check_slow_dataloader(dataset) + self.assertFalse(checker.dataloader_issues) + + def test_no_slow_dataloader(self): + dataloader_duration = (self.rule.get("dataloader_duration_threshold") - 1) * 1000 + dataset = self._get_mock_dataset(dataloader_duration, is_empty_dataset=False) + checker = DataloaderChecker() + checker.check_slow_dataloader(dataset) + self.assertFalse(checker.dataloader_issues) + + def test_found_slow_dataloader(self): + dataloader_duration = (self.rule.get("dataloader_duration_threshold") + 1) * 1000 + dataset = self._get_mock_dataset(dataloader_duration, is_empty_dataset=False) + checker = DataloaderChecker() + checker.check_slow_dataloader(dataset) + self.assertTrue(checker.dataloader_issues) + + desc = self.rule.get("problem").format(dataloader_duration=dataloader_duration / 1000, + dataloader_duration_threshold=self.rule.get( + "dataloader_duration_threshold")) + + self.assertEqual(desc, checker.desc) + + def _get_mock_dataset(self, dur, is_empty_dataset=False): + dataset = TimelineEvent() + if is_empty_dataset: + return dataset + + dataset["dataloader"] = [TimelineEvent({"dur": dur, "name": "dataloader"})] + return dataset + + +if __name__ == '__main__': + tester = TestDataloaderChecker() + tester.test_no_dataloader() + tester.test_no_slow_dataloader() + tester.test_found_slow_dataloader() diff --git a/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_syncbn_checker.py b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_syncbn_checker.py new file mode 100644 index 00000000000..d1df810a0ec --- /dev/null +++ 
b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_syncbn_checker.py @@ -0,0 +1,62 @@ +import unittest +import os +import sys +import yaml + +from profiler.advisor.analyzer.schedule.syncbn.syncbn_checker import SyncBNChecker +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestSyncBNChecker(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + recover_env() + + def setUp(self) -> None: + rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))))), + "advisor", "rules", "sync_batchnorm.yaml") + + with open(rule_path, "rb") as file: + self.rule = yaml.safe_load(file) + + def test_no_syncbn(self): + dataset = self._get_mock_dataset(1, is_empty_dataset=True) + + checker = SyncBNChecker() + checker.check_syncbn(dataset) + self.assertFalse(checker.syncbn_issues) + + def test_syncbn_not_reach_threshold(self): + dataset = self._get_mock_dataset(self.rule.get("max_syncbn_num") - 1, is_empty_dataset=False) + checker = SyncBNChecker() + checker.check_syncbn(dataset) + self.assertFalse(checker.syncbn_issues) + + def test_found_slow_dataloader(self): + dataset = self._get_mock_dataset(self.rule.get("max_syncbn_num") + 1, is_empty_dataset=False) + checker = SyncBNChecker() + checker.check_syncbn(dataset) + self.assertTrue(checker.syncbn_issues) + + desc = self.rule.get("problem").format(syncbn_num=self.rule.get("max_syncbn_num") + 1) + + self.assertEqual(desc, checker.desc) + + def _get_mock_dataset(self, syncbn_num, is_empty_dataset=False): + dataset = TimelineEvent() + if is_empty_dataset: + return dataset + + dataset["sync_batchnorm"] = [] + for _ in range(syncbn_num): + dataset["sync_batchnorm"].append(TimelineEvent({"name": "SyncBatchNorm"})) + return dataset + + +if __name__ == '__main__': + tester = TestSyncBNChecker() + tester.test_no_syncbn() 
+ tester.test_syncbn_not_reach_threshold() + tester.test_found_slow_dataloader() diff --git a/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_synchronize_stream.py b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_synchronize_stream.py new file mode 100644 index 00000000000..360363ce371 --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_synchronize_stream.py @@ -0,0 +1,55 @@ +import unittest +import os +import sys +import yaml + +from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_checker import SynchronizeStreamChecker +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestSynchronizeChecker(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + recover_env() + + def setUp(self) -> None: + rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))))), + "advisor", "rules", "synchronize.yaml") + + with open(rule_path, "rb") as file: + self.rule = yaml.safe_load(file) + + def test_no_synchronize_stream(self): + dataset = self._get_mock_dataset(1, [], is_empty_dataset=True) + + checker = SynchronizeStreamChecker() + checker.check_synchronize(dataset) + self.assertFalse(checker.synchronize_issues) + + def test_max_synchronize_stream(self): + dataset = self._get_mock_dataset(100, [], is_empty_dataset=False) + checker = SynchronizeStreamChecker() + checker.check_synchronize(dataset) + self.assertFalse(checker.synchronize_issues) + + def _get_mock_dataset(self, total_count, slow_synchronize_stream, is_empty_dataset=False): + dataset = TimelineEvent() + if is_empty_dataset: + return dataset + + dataset["synchronize_stream"] = TimelineEvent( + dict( + total_count=total_count, + slow_synchronize_stream=slow_synchronize_stream, + rule=dict(max_synchronize_num=10, problem="", 
solutions=[]), + ) + ) + return dataset + + +if __name__ == '__main__': + tester = TestSynchronizeChecker() + tester.test_no_synchronize_stream() + tester.test_max_synchronize_stream() diff --git a/profiler/test/ut/advisor/compute_advice/test_frequency_advice.py b/profiler/test/ut/advisor/compute_advice/test_frequency_advice.py new file mode 100644 index 00000000000..51acf3b8e24 --- /dev/null +++ b/profiler/test/ut/advisor/compute_advice/test_frequency_advice.py @@ -0,0 +1,145 @@ +import os +import shutil +import stat +import json + +import unittest +from profiler.advisor.interface.interface import Interface +from profiler.advisor.common.analyzer_scopes import SupportedScopes + + +class TestFrequencyAdvice(unittest.TestCase): + TMP_DIR = "./ascend_pt" + OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT" + DEVICE_DIR = "./ascend_pt/PROF_000001_20240415174447255_OAANHDOMMJMHGIFC/device_0" + interface = None + err_interface = None + + def tearDown(self): + if os.path.exists(TestFrequencyAdvice.TMP_DIR): + shutil.rmtree(TestFrequencyAdvice.TMP_DIR) + self.clear_htmls() + + def setUp(self): + if os.path.exists(TestFrequencyAdvice.TMP_DIR): + shutil.rmtree(TestFrequencyAdvice.TMP_DIR) + if not os.path.exists(TestFrequencyAdvice.TMP_DIR): + os.makedirs(TestFrequencyAdvice.TMP_DIR) + if not os.path.exists(TestFrequencyAdvice.OUTPUT_DIR): + os.makedirs(TestFrequencyAdvice.OUTPUT_DIR) + if not os.path.exists(TestFrequencyAdvice.DEVICE_DIR): + os.makedirs(TestFrequencyAdvice.DEVICE_DIR) + self.clear_htmls() + + @classmethod + def clear_htmls(cls): + current_path = os.path.dirname(os.path.abspath(__file__)) + for filename in os.listdir(current_path): + # 检查文件是否以“att”开头 + if filename.startswith("att"): + # 构建文件的完整路径 + file_path = os.path.join(current_path, filename) + # 删除文件 + os.remove(file_path) + + @classmethod + def get_basic_trace_view(cls): + # Python pid + py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}} + # ascend pid + 
ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}} + # ascend pid + cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}} + # ascend hardware ops + ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4, + "args": {"Task Type": "AI_CORE"}} + ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106888", "dur": 80, "tid": 3, "pid": 4, + "args": {"Task Type": "AI_CORE"}} + # flow event + flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}} + flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}} + return [py_pid_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, flow_event_s, flow_event_e] + + @classmethod + def create_info_json(cls): + info = { + "DeviceInfo": [ + { + "id": 7, + "env_type": 3, + "ctrl_cpu_id": "ARMv8_Cortex_A55", + "ctrl_cpu_core_num": 1, + "ctrl_cpu_endian_little": 1, + "ts_cpu_core_num": 0, + "ai_cpu_core_num": 6, + "ai_core_num": 25, + "ai_cpu_core_id": 2, + "ai_core_id": 0, + "aicpu_occupy_bitmap": 252, + "ctrl_cpu": "0", + "ai_cpu": "2,3,4,5,6", + "aiv_num": 50, + "hwts_frequency": "49.999001", + "aic_frequency": "1850", + "aiv_frequency": "1850" + } + ] + } + with os.fdopen(os.open(f"{TestFrequencyAdvice.DEVICE_DIR}/info.json.0", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(info)) + + @classmethod + def create_non_910B_trace_view(cls): + basic_info = cls.get_basic_trace_view() + + # python ops + py_event1 = {"ph": "X", "cat": "python_function", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2, + "pid": 1, + "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}} + py_event2 = {"ph": "X", "cat": "python_function", "name": "slice", "ts": "199", "dur": 200, "tid": 2, "pid": 1, + "args": {"Call stack": 
"/root/test/slice.py(116);\r\n/root/torch/module.py"}} + raw_data = [ + *basic_info, py_event1, py_event2 + ] + with os.fdopen(os.open(f"{TestFrequencyAdvice.OUTPUT_DIR}/trace_view.json", + # with os.fdopen(os.open(f"{TestFrequencyAdvice.OUTPUT_DIR}/msprof_20240415174455.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + @classmethod + def create_910B_trace_view(cls): + basic_info = cls.get_basic_trace_view() + + # python ops + py_event1 = {"name": "AI Core Freq", "ts": "1699529623106000.061", "pid": 682820896, "tid": 0, + "args": {"MHz": 1850}, "ph": "C"} + py_event2 = {"name": "AI Core Freq", "ts": "1699529623106770.541", "pid": 682820896, "tid": 0, + "args": {"MHz": 800}, "ph": "C"} + raw_data = [ + *basic_info, py_event1, py_event2 + ] + + with os.fdopen(os.open(f"{TestFrequencyAdvice.OUTPUT_DIR}/trace_view.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + def test_run_should_run_success_when_msprof_not_contain_frequency_data(self): + self.create_info_json() + self.create_non_910B_trace_view() + interface = Interface(profiling_path=self.TMP_DIR) + dimension = "computation" + scope = SupportedScopes.FREQ_ANALYSIS + result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR) + self.assertEqual(0, len(result.data.get("AI Core Frequency", []))) + result.clear() + + def test_run_should_run_success_when_trace_view_contain_frequency_data(self): + self.create_info_json() + self.create_910B_trace_view() + interface = Interface(profiling_path=self.TMP_DIR) + dimension = "computation" + scope = SupportedScopes.FREQ_ANALYSIS + result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR) + self.assertEqual(2, len(result.data.get("AI Core Frequency", dict).get("data", []))) + result.clear() diff --git 
a/profiler/test/ut/compare_tools/compare_bean/origin_data_bean/test_kernel_details_bean.py b/profiler/test/ut/compare_tools/compare_bean/origin_data_bean/test_kernel_details_bean.py index 7abf8da647d..869ee85570f 100644 --- a/profiler/test/ut/compare_tools/compare_bean/origin_data_bean/test_kernel_details_bean.py +++ b/profiler/test/ut/compare_tools/compare_bean/origin_data_bean/test_kernel_details_bean.py @@ -47,5 +47,5 @@ class TestKernelDetailsBean(unittest.TestCase): self.assertFalse(self.kernel_bean2.is_flash_attention()) def test_is_cube(self): - self.assertTrue(self.kernel_bean2.is_cube()) - self.assertFalse(self.kernel_bean3.is_cube()) + self.assertTrue(self.kernel_bean2.is_matmul()) + self.assertFalse(self.kernel_bean3.is_matmul()) diff --git a/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py b/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py index 44d97b248e6..80734635929 100644 --- a/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py +++ b/profiler/test/ut/compare_tools/profiling_parser/test_base_profiling_parser.py @@ -24,6 +24,11 @@ class ProfilingParser(BaseProfilingParser): self._enable_operator_compare = True self._enable_memory_compare = True self._enable_communication_compare = True + self._enable_kernel_compare = True + self._enable_api_compare = True + + def _update_kernel_details(self): + pass def _update_memory_list(self): pass diff --git a/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py b/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py index 04468721504..d7cb3d0588a 100644 --- a/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py +++ b/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py @@ -68,6 +68,7 @@ class TestGpuProfilingParser(unittest.TestCase): patch("compare_backend.profiling_parser.gpu_profiling_parser.GPUProfilingParser.__init__", 
return_value=None): res = GPUProfilingParser({}, {}) + res._profiling_type = "GPU" res._trace_events = [TraceEventBean(event) for event in self.trace_events] res._result_data = ProfilingResult("GPU") res._compute_stream_id = 3 -- Gitee From 7a02fd3732936a65d69aa5fbc71fa2938eafde9c Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 1 Aug 2024 03:11:01 +0000 Subject: [PATCH 09/94] monitor gnorm before reduce by model hook --- .../accuracy_tools/kj600/kj600/module_hook.py | 148 +++++++++++------- 1 file changed, 90 insertions(+), 58 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 6d1b25023e5..8b8826a84d0 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -83,12 +83,14 @@ class GradContext: def __init__(self) -> None: self.pre = [] self.post = [] - self.grad_acc = None + self.grad_acc = {} + self.micro_step = -1 def reset(self): self.pre.clear() self.post.clear() - self.grad_acc.fill_(0.) + for k,v in self.grad_acc.items(): + v.fill_(0.) 
class TrainerMon: @@ -101,7 +103,7 @@ class TrainerMon: self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext) self.optimizer_context = defaultdict(OptimizerContext) self.cc_context = defaultdict(CommunicationContext) - self.grad_context = defaultdict(GradContext) + self.grad_context = GradContext() self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) self.module_rank_list = self.config.get("module_ranks", []) @@ -160,7 +162,12 @@ class TrainerMon: # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) - self.micro_batch_number = 0 + self.micro_batch_number = 1 + self.step = -1 + self.rank = dist.get_rank() + + self.weight_hooked = False + self.optimizer_hooked = False self.param_name_list = [] self.param2name = defaultdict(str) @@ -220,11 +227,47 @@ class TrainerMon: self.hook_optimizer() return + def _get_wg_metric(self, tag='pre_grad'): + grad_dict = {} + for param, name in self.param2name.items(): + grad = param.main_grad if self.params_have_main_grad else param.grad + if grad is None: + print_warn_log(f"grad is None: {name}, maybe something wrong happened.") + continue + key = get_summary_writer_tag_name(name, tag, self.rank) + grad_dict[key] = grad + metric_dict = {} + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, grad_dict, self.eps) + return metric_dict + + def monitor_gnorm_with_ad(self, model, grad_acc_steps): - self._hook_weights(model) self.hook_optimizer() + + if self.print_struct: + self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)} + return + self._register_param_name(model) + self.micro_batch_number = grad_acc_steps + def model_backward_hook(module, input_grad, output_grad): + + if self.wg_distribution: + 
self.grad_context.micro_step += 1 ## error if vpp + if self.grad_context.micro_step == (self.micro_batch_number - 1): + print('>> pre grad from backward') + self.grad_context.micro_step = -1 + wg_metric_dict = self._get_wg_metric(tag='pre_grad') + self.grad_context.pre.append(wg_metric_dict) + + # for model_chunk in model: + # model_chunk.register_full_backward_hook(model_backward_hook) + + self._hook_weights() + + def build_tbtag_tensor_map(self, module_name, tag, tensor): metrics = {} rank = dist.get_rank() if dist.is_initialized() else None @@ -277,13 +320,12 @@ class TrainerMon: if not self.wg_distribution: return - for name in self.param2name.values(): - context = self.grad_context[name] - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, context.pre, step) - write_metrics_tensorboard(metric_name, self.summary_writer, context.post, step) + for metric_name in self.ops: + # write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.pre, step) + write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.post, step) + write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.xx, step) - context.reset() + self.grad_context.reset() def hook_optimizer(self): # in DDP by default use params_have_main_grad @@ -303,7 +345,18 @@ class TrainerMon: context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) - rank = dist.get_rank() if dist.is_initialized() else None + if self.wg_distribution: + if self.weight_hooked: + print('>>> pre grad from weight hook') + metric_dict = {} + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.grad_acc, self.eps) + # self.grad_context.grad.append(metric_dict) + self.grad_context.xx = [metric_dict] + + wg_metric_dict = self._get_wg_metric(tag='post_grad') + 
self.grad_context.post.append(wg_metric_dict) + for param, name in self.param2name.items(): if "params_effrank" in self.config and name in self.config["params_effrank"]: context.param_effective_rank[name] = eff_rank(param.detach()) @@ -311,18 +364,6 @@ class TrainerMon: if grad is None: print_warn_log(f"grad is None: {name}, maybe something wrong happened.") continue - if self.wg_distribution: - metric_dict = {} - key = get_summary_writer_tag_name(name, 'post_grad', rank) - for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, {key: grad}, self.eps) - self.grad_context[name].post.append(metric_dict) - - metric_dict = {} - key = get_summary_writer_tag_name(name, 'pre_grad', rank) - for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, {key: self.grad_context[name].grad_acc}, self.eps) - self.grad_context[name].pre.append(metric_dict) if self.mg_direction: if context.step == 0: @@ -390,6 +431,22 @@ class TrainerMon: else: print_info_log(msg) + def _register_param_name(self, model): + if isinstance(model, list): + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') + + for vpp_stage, model_chunk in enumerate(model): + prefix = f'{vpp_stage}_' if self.vpp else '' + for param_name, param in model_chunk.named_parameters(): + name = prefix + param_name + for target in self.config['targets'].keys(): + if param_name.startswith(target) and param.requires_grad: + self._smallest_rank_print(f'>> monitoring: {name}') + self.param2name[param] = name + + def _hook_module(self, target_names, module: torch.nn.Module, fwd_or_bkd): if '_modules' not in module.__dict__: # nothing to hook @@ -483,41 +540,16 @@ class TrainerMon: hooked_count += 1 return hooked_count - def _hook_weights(self, model): - self.wg_distribution = True - rank = dist.get_rank() if dist.is_initialized() else None + def _hook_weights(self): + context = self.grad_context - def param_hook(grad, context): + def param_hook(grad, 
grad_acc): with torch.no_grad(): - context.grad_acc += grad - - def register_hooks(model_chunk, prefix=''): - for param_name, param in model_chunk.named_parameters(): - name = prefix + param_name - for target in self.config['targets'].keys(): - context = self.grad_context[name] - if param_name.startswith(target) and param.requires_grad: - self._smallest_rank_print(f'>> monitoring: {name}') - self.param2name[param] = name - param.register_hook(partial(param_hook, context=context)) - context.grad_acc = torch.zeros_like(param).to(DEVICE) + grad_acc += grad + for param, name in self.param2name.items(): + key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) + context.grad_acc[key] = torch.zeros_like(param).to(DEVICE) + param.register_hook(partial(param_hook, grad_acc=context.grad_acc[key])) - if self.print_struct: - self.module_struct = { - module_name: 1. for module_name, module in model.named_modules()} - return - - if isinstance(model, list): - if len(model) > 1: - self.vpp = True - self._smallest_rank_print('vpp enabled') - - for vpp_stage, model_chunk in enumerate(model): - prefix = f'{vpp_stage}_' if self.vpp else '' - register_hooks(model_chunk, prefix=prefix) - - else: - register_hooks(model) - - + self.weight_hooked = True \ No newline at end of file -- Gitee From 392d785a34a3de7eedfebe48668ee90850c88b7e Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 1 Aug 2024 03:21:59 +0000 Subject: [PATCH 10/94] add package data --- debug/accuracy_tools/kj600/pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/pyproject.toml b/debug/accuracy_tools/kj600/pyproject.toml index 5df96856334..dd5faebc38e 100644 --- a/debug/accuracy_tools/kj600/pyproject.toml +++ b/debug/accuracy_tools/kj600/pyproject.toml @@ -7,7 +7,6 @@ name = "kj600" version = "0.0.1" dependencies = [ "torch", - "torch_npu", "torchvision", "tensorboard", "matplotlib", @@ -16,4 +15,7 @@ dependencies = [ ] [tool.setuptools.packages] -find 
= {} # Scan the project directory with the default parameters \ No newline at end of file +find = {} # Scan the project directory with the default parameters + +[tool.setuptools.package-data] +kj600 = ["distributed/*.yaml"] \ No newline at end of file -- Gitee From 133c95ce5a0556828c432b3dfc84834391005255 Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 07:58:46 +0000 Subject: [PATCH 11/94] support csv output --- .../kj600/kj600/anomaly_detect.py | 53 ++++++++++- .../accuracy_tools/kj600/kj600/module_hook.py | 95 ++++++++++--------- .../kj600/kj600/module_metric.py | 27 ++++-- debug/accuracy_tools/kj600/kj600/utils.py | 27 +++++- 4 files changed, 150 insertions(+), 52 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index cbd7b6daa2f..5a98aabb863 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -1,10 +1,13 @@ +import os import statistics as st from abc import ABC from typing import List +import pandas as pd import sys from torch.utils.tensorboard import SummaryWriter from collections import defaultdict -from kj600.utils import print_info_log +from kj600.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory + class ScanRule(ABC): def apply(self, history, cur): @@ -59,6 +62,54 @@ class bcolors: BOLD = '\033[1m' UNDERLINE = '\033[4m' + +class CSVWriterWithAD: + def __init__(self, path, ad_rules, job_id, anomaly_inform=False): + self.path = path + create_directory(path) + self.tag2scalars = defaultdict(list) + self.ad_rules = ad_rules + self.job_id = job_id + self.anomaly_inform = anomaly_inform + self.context_dict = defaultdict(list) + self.header = [] + + def write_csv(self, prefix, step): + if len(self.context_dict) == 0: + return + filepath = os.path.join(self.path, f'{prefix}_{step}.csv') + if not os.path.exists(filepath): + make_file_safety(filepath) + 
data_frame = pd.DataFrame(columns=self.header) + data_frame.to_csv(filepath, index=False) + + check_file_valid_writable(filepath) + new_data = pd.DataFrame([[name]+metric_value for name, metric_value in self.context_dict.items()]) + new_data.to_csv(filepath, mode='a+', header=False, index=False) + self.context_dict = defaultdict(list) + + def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): + new_avg = avg = scalar_value + if tag in self.tag2scalars: + N = len(self.tag2scalars[tag]) + _, avg = self.tag2scalars[tag][-1] + new_avg = (avg*N + scalar_value)/(N + 1) + self.tag2scalars[tag].append((scalar_value, new_avg)) + detected, rule_name = self._ad(scalar_value, history=avg) + if detected: + print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") + exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" + if self.anomaly_inform: + self.anomaly_inform.run(exception_message, self.job_id) + + name = tag.split('/')[0] + self.context_dict[name].append(scalar_value) + + def _ad(self, scalar_value, history): + return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) + + + class SummaryWriterWithAD(SummaryWriter): def __init__(self, path, ad_rules, job_id, anomaly_inform=False): super().__init__(path) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 8b8826a84d0..129323340b5 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -11,9 +11,9 @@ from kj600.module_spec_verifier import get_config, validate_config_spec from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import 
HeatmapVisualizer -from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD +from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD, CSVWriterWithAD from kj600.anomaly_inform import AnomalyInformFactory -from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics +from kj600.module_metric import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate from kj600.utils import print_warn_log, print_info_log, get_param_struct @@ -83,14 +83,17 @@ class GradContext: def __init__(self) -> None: self.pre = [] self.post = [] - self.grad_acc = {} + self.acc_metric = [] + self.acc = {} self.micro_step = -1 def reset(self): self.pre.clear() self.post.clear() - for k,v in self.grad_acc.items(): + self.acc_metric.clear() + for k,v in self.acc.items(): v.fill_(0.) + class TrainerMon: @@ -107,12 +110,12 @@ class TrainerMon: self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) self.module_rank_list = self.config.get("module_ranks", []) + self.format = self.config.get('format', 'tensorboard') self.eps = self.config.get('eps', 1e-8) self.ops = self.config.get('ops', []) self.xy_distribution = self.config.get('xy_distribution', False) if not self.xy_distribution: print_rank_0("> module input/output input_grad/output_grad is not monitored. ") - # backward hook cause megatron-lm pipeline parallel schedule assert exception. # TBD: backward hook cause output tensor is view of some base tensor. root cause invesigation pending. 
self.forward_only = self.config.get('forward_only', False) @@ -153,12 +156,19 @@ class TrainerMon: output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] + + if self.format == 'tensorboard': + writer = SummaryWriterWithAD + self.write_metrics = write_metrics_tensorboard + elif self.format == 'csv': + writer = CSVWriterWithAD + self.write_metrics = write_metrics_csv if dist.is_initialized(): if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0: - self.summary_writer = SummaryWriterWithAD( + self.summary_writer = writer( os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) else: - self.summary_writer = SummaryWriterWithAD(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) + self.summary_writer = writer(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) @@ -221,13 +231,13 @@ class TrainerMon: for name, param in model.named_parameters(): print_rank_0(f"\t{name}") for target_module, _ in self.config['targets'].items(): - if name.startswith(target_module): # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 + if name.startswith(target_module) and param.requires_grad: # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 self.param_name_list.append(name) self.param2name[param] = name self.hook_optimizer() return - def _get_wg_metric(self, tag='pre_grad'): + def _get_wg_metric(self, tag): grad_dict = {} for param, name in self.param2name.items(): grad = param.main_grad if self.params_have_main_grad else 
param.grad @@ -244,28 +254,15 @@ class TrainerMon: def monitor_gnorm_with_ad(self, model, grad_acc_steps): self.hook_optimizer() + self.micro_batch_number = grad_acc_steps + self.wg_distribution = True if self.print_struct: self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)} return self._register_param_name(model) - - self.micro_batch_number = grad_acc_steps - - def model_backward_hook(module, input_grad, output_grad): - - if self.wg_distribution: - self.grad_context.micro_step += 1 ## error if vpp - if self.grad_context.micro_step == (self.micro_batch_number - 1): - print('>> pre grad from backward') - self.grad_context.micro_step = -1 - wg_metric_dict = self._get_wg_metric(tag='pre_grad') - self.grad_context.pre.append(wg_metric_dict) - - # for model_chunk in model: - # model_chunk.register_full_backward_hook(model_backward_hook) - - self._hook_weights() + self._hook_model_for_grad_acc(model) + # self._hook_weights() def build_tbtag_tensor_map(self, module_name, tag, tensor): @@ -305,25 +302,23 @@ class TrainerMon: for _, fwd_context in self.module_fwd_hook_context_by_module.items(): if not len(fwd_context.actv) == self.micro_batch_number: print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, fwd_context.actv, step) + self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv') fwd_context.actv.clear() for _, bwd_context in self.module_bwd_hook_context_by_module.items(): if not len(bwd_context.actvgrad) == self.micro_batch_number: print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") - for metric_name in self.ops: - write_metrics_tensorboard(metric_name, self.summary_writer, bwd_context.actvgrad, step) + 
self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') bwd_context.actvgrad.clear() def write_grad_tb(self, step): if not self.wg_distribution: return - for metric_name in self.ops: - # write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.pre, step) - write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.post, step) - write_metrics_tensorboard(metric_name, self.summary_writer, self.grad_context.xx, step) + self.write_metrics(self.ops, self.summary_writer, self.grad_context.pre, step, 'grad_unreduced') + self.write_metrics(self.ops, self.summary_writer, self.grad_context.post, step, 'grad_reduced') + if self.weight_hooked: + self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_accumulated') self.grad_context.reset() @@ -347,12 +342,10 @@ class TrainerMon: if self.wg_distribution: if self.weight_hooked: - print('>>> pre grad from weight hook') metric_dict = {} for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.grad_acc, self.eps) - # self.grad_context.grad.append(metric_dict) - self.grad_context.xx = [metric_dict] + metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.acc, self.eps) + self.grad_context.acc_metric = [metric_dict] wg_metric_dict = self._get_wg_metric(tag='post_grad') self.grad_context.post.append(wg_metric_dict) @@ -388,6 +381,7 @@ class TrainerMon: cc_metrics = self.generate_cc_metrics(k, c) for op, m in cc_metrics.items(): metric_dict[op].update(m) + if not metric_dict: return context.metric_list.append(metric_dict) @@ -407,10 +401,8 @@ class TrainerMon: for param_name, _ in context.param_adam_ratio.items(): self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer) - for metric_name in self.ops: - if not context.metric_list: - break - write_metrics_tensorboard(metric_name, 
self.summary_writer, context.metric_list, context.step) + if context.metric_list: + self.write_metrics(self.ops, self.summary_writer, context.metric_list, context.step, 'other') context.metric_list.clear() context.step += 1 @@ -540,6 +532,21 @@ class TrainerMon: hooked_count += 1 return hooked_count + def _hook_model_for_grad_acc(self, model): + def model_backward_hook(module, input_grad, output_grad): + model_chunk.micro_step += 1 ## error if vpp + if model_chunk.micro_step == (self.micro_batch_number): + model_chunk.micro_step = 0 + wg_metric_dict = self._get_wg_metric(tag='pre_grad') + self.grad_context.pre.append(wg_metric_dict) + + if not isinstance(model, list): + model = [model] + + for model_chunk in model: + setattr(model_chunk,'micro_step', 0) + model_chunk.register_full_backward_hook(model_backward_hook) + def _hook_weights(self): context = self.grad_context @@ -549,7 +556,7 @@ class TrainerMon: for param, name in self.param2name.items(): key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) - context.grad_acc[key] = torch.zeros_like(param).to(DEVICE) - param.register_hook(partial(param_hook, grad_acc=context.grad_acc[key])) + context.acc[key] = torch.zeros_like(param).to(DEVICE) + param.register_hook(partial(param_hook, grad_acc=context.acc[key])) self.weight_hooked = True \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index e09536b072c..57934ed8199 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -150,9 +150,24 @@ def get_metrics(metric_name, tag2tensor, eps): raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e -def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step): - try: - fun_metric = config_metric_registry[metric_name] - return 
fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) - except KeyError as e: - raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e +def write_metrics_tensorboard(ops, summary_writer, metric_value, step, prefix=''): + for metric_name in ops: + try: + fun_metric = config_metric_registry[metric_name] + fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) + except KeyError as e: + raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e + +def write_metrics_csv(ops, summary_writer, metric_value, step, preifx=''): + for metric_name in ops: + try: + fun_metric = config_metric_registry[metric_name] + fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) + + except KeyError as e: + print(e) + raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e + + if not summary_writer.header: + summary_writer.header = ['param_name'] + ops + summary_writer.write_csv(preifx, step) \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/utils.py b/debug/accuracy_tools/kj600/kj600/utils.py index 53d47d99886..3aed6911c44 100644 --- a/debug/accuracy_tools/kj600/kj600/utils.py +++ b/debug/accuracy_tools/kj600/kj600/utils.py @@ -107,4 +107,29 @@ def check_file_valid_readable(path): def check_file_valid_writable(path): check_file_valid(path) check_path_writability(path) - \ No newline at end of file + + +def make_file_safety(file_path: str, permission=0o640): + if os.path.islink(file_path): + raise RuntimeError(f"Invalid soft link path: {file_path}") + file_real_path = os.path.realpath(file_path) + if os.path.exists(file_real_path): + return + parent_path = os.path.dirname(file_real_path) + if not os.path.exists(parent_path): + os.makedirs(parent_path, mode=0o750, 
exist_ok=True) + if not os.access(parent_path, os.W_OK): + raise PermissionError(f"The path {parent_path} is not writable!") + try: + os.close(os.open(file_real_path, os.O_WRONLY | os.O_CREAT, permission)) + except OSError as e: + raise RuntimeError("Can't create file: " + file_real_path) from e + os.chmod(file_real_path, permission) + + +def create_directory(dir_path): + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=0o750, exist_ok=True) + except OSError as ex: + raise RuntimeError("Failed to create directory. Please check the path permission or disk space.") from ex \ No newline at end of file -- Gitee From 62554172c93e23f586fd1ebc7b69fca6f773832d Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 08:27:34 +0000 Subject: [PATCH 12/94] temporary disable ad inform --- .../accuracy_tools/kj600/kj600/module_hook.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 129323340b5..af9f26bba19 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -12,7 +12,7 @@ from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, Optim from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD, CSVWriterWithAD -from kj600.anomaly_inform import AnomalyInformFactory +# from kj600.anomaly_inform import AnomalyInformFactory from kj600.module_metric import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate from kj600.utils import print_warn_log, print_info_log, get_param_struct @@ -149,9 +149,11 @@ class TrainerMon: alert_setting = self.config.get('alert', {"rules":[]}) self.alert_rules = 
AnomalyScanner.load_rules(alert_setting["rules"]) - anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None + # anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None + anomaly_inform = None self.optimizer_hooked = False + self.param_registered = False self.vpp = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') @@ -179,7 +181,6 @@ class TrainerMon: self.weight_hooked = False self.optimizer_hooked = False - self.param_name_list = [] self.param2name = defaultdict(str) self.mix_precision_optimizer_mon = OptimizerMonFactory.create_optimizer_mon(opt_ty) @@ -232,7 +233,6 @@ class TrainerMon: print_rank_0(f"\t{name}") for target_module, _ in self.config['targets'].items(): if name.startswith(target_module) and param.requires_grad: # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 - self.param_name_list.append(name) self.param2name[param] = name self.hook_optimizer() return @@ -263,6 +263,7 @@ class TrainerMon: self._register_param_name(model) self._hook_model_for_grad_acc(model) # self._hook_weights() + self.hook_modules(model[0], grad_acc_steps) def build_tbtag_tensor_map(self, module_name, tag, tensor): @@ -300,6 +301,7 @@ class TrainerMon: if not self.xy_distribution: return for _, fwd_context in self.module_fwd_hook_context_by_module.items(): + print(fwd_context.actv) if not len(fwd_context.actv) == self.micro_batch_number: print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv') @@ -424,6 +426,8 @@ class TrainerMon: print_info_log(msg) def _register_param_name(self, model): + if self.param_registered: + return if isinstance(model, list): if len(model) > 1: self.vpp = True @@ 
-438,6 +442,8 @@ class TrainerMon: self._smallest_rank_print(f'>> monitoring: {name}') self.param2name[param] = name + self.param_registered = True + def _hook_module(self, target_names, module: torch.nn.Module, fwd_or_bkd): if '_modules' not in module.__dict__: @@ -519,7 +525,7 @@ class TrainerMon: context.step += 1 return - hooked_count = 0 + hooked_count = 0 for name, submodule in module.named_modules(): self.module_struct[name] = {} if name in target_names: @@ -559,4 +565,4 @@ class TrainerMon: context.acc[key] = torch.zeros_like(param).to(DEVICE) param.register_hook(partial(param_hook, grad_acc=context.acc[key])) - self.weight_hooked = True \ No newline at end of file + self.weight_hooked = True -- Gitee From 554ed890efb176a36d4a3ef37949559fde89e35e Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 08:41:45 +0000 Subject: [PATCH 13/94] fset status for opt hook --- debug/accuracy_tools/kj600/kj600/module_hook.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index af9f26bba19..1b658f33e05 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -227,7 +227,6 @@ class TrainerMon: return if not self.optimizer_hooked: - self.optimizer_hooked = True print_rank_0("> parameter names:") for name, param in model.named_parameters(): print_rank_0(f"\t{name}") @@ -409,9 +408,13 @@ class TrainerMon: context.step += 1 return + + if self.optimizer_hooked: + return if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): register_optimizer_step_pre_hook(optimizer_pre_step_hook) register_optimizer_step_post_hook(optimizer_post_step_hook) + self.optimizer_hooked = True return def _smallest_rank_print(self, msg): -- Gitee From 5440ed2082e24af15b31d896e36eaed0679b04e0 Mon Sep 17 00:00:00 2001 From: jiandaobao Date: Mon, 5 Aug 2024 14:52:39 +0800 
Subject: [PATCH 14/94] Gradient Monitoring Anomaly Timing Feature --- .../kj600/kj600/anomaly_analyse.py | 84 +++++ .../kj600/kj600/anomaly_detect.py | 87 ++++- .../kj600/distributed/wrap_distributed.py | 7 + .../accuracy_tools/kj600/kj600/file_check.py | 345 ++++++++++++++++++ .../accuracy_tools/kj600/kj600/module_hook.py | 85 ++++- .../kj600/kj600/module_metric.py | 8 +- 6 files changed, 589 insertions(+), 27 deletions(-) create mode 100644 debug/accuracy_tools/kj600/kj600/anomaly_analyse.py create mode 100644 debug/accuracy_tools/kj600/kj600/file_check.py diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py new file mode 100644 index 00000000000..737ec26dd6e --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os +import fcntl +import json +from pathlib import Path +from kj600.utils import print_info_log +from kj600.file_check import ( + change_mode, + FileCheckConst, + check_path_before_create, + FileChecker, +) + + +class AnomalyDataWriter: + + def __init__(self, dump_path, rank) -> None: + self.dump_path = dump_path + self.dump_rank_dir = os.path.join(self.dump_path, f"rank{rank}") + self.json_path = os.path.join(self.dump_rank_dir, "anomaly.json") + + def init_detected_json(self): + """初始化落盘文件""" + check_path_before_create(self.dump_path) + if not os.path.exists(self.dump_path): + Path(self.dump_path).mkdir( + mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True + ) + file_check = FileChecker(self.dump_path, FileCheckConst.DIR) + file_check.common_check() + + if not os.path.exists(self.dump_rank_dir): + Path(self.dump_rank_dir).mkdir( + FileCheckConst.DATA_DIR_AUTHORITY, parents=True, exist_ok=True + ) + + if os.path.exists(self.json_path): + os.remove(self.json_path) + Path(self.json_path).touch() + change_mode(self.json_path, FileCheckConst.DATA_FILE_AUTHORITY) + + def write_detected_json(self, anomalies): + anomalies_json = self._get_anomaly_dict(anomalies) + print_info_log(f"detected.json is at {self.dump_rank_dir}. 
") + if Path(self.json_path).exists() and os.path.getsize(self.json_path) > 0: + with open(self.json_path, "r+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + data_to_write = json.load(f) + fcntl.flock(f, fcntl.LOCK_UN) + else: + data_to_write = {} + data_to_write.update(anomalies_json) + with open(self.json_path, "w+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + json.dump(data_to_write, f, indent=1) + fcntl.flock(f, fcntl.LOCK_UN) + + @staticmethod + def _get_anomaly_dict(anomalies): + """将GradAnomalyData列表转换为json + """ + anomalies_json = {} + for anomaly in anomalies: + anomalies_json.update({anomaly.get_key(): anomaly.to_dict()}) + return anomalies_json + +class AnomalyDataLoader: + pass \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index cbd7b6daa2f..0ccb481ab0f 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -5,6 +5,7 @@ import sys from torch.utils.tensorboard import SummaryWriter from collections import defaultdict from kj600.utils import print_info_log +from dataclasses import dataclass, field class ScanRule(ABC): def apply(self, history, cur): @@ -59,13 +60,86 @@ class bcolors: BOLD = '\033[1m' UNDERLINE = '\033[4m' +class AnomalyDataFactory(ABC): + def __init__(self, rank, pp_stage, group_mates): + super().__init__() + self.rank = rank + self.pp_stage = pp_stage + self.group_mates = group_mates + self.call_id = 0 + self.micro_step = 0 + self.vpp_stage = 0 + + def set_context(self, context): + """根据当前GradContext信息更新call_id vpp_stage等信息 + """ + if hasattr(context, "call_id"): + self.call_id = context.call_id + if hasattr(context, "vpp_stage"): + self.vpp_stage = context.vpp_stage + + def create(self, tag_name, message, step): + """如果检查出异常, 调用当前接口生成GradAnomalyData实例 + """ + return GradAnomalyData( + self.rank, + step, + self.micro_step, + self.pp_stage, + self.vpp_stage, + self.call_id, + 
tag_name, + message, + self.group_mates + ) + +@dataclass(eq=True) +class GradAnomalyData: + rank: int = 0 + step: int = 0 + micro_step: int = 0 + pp_stage: int = 0 + vpp_stage: int = 0 + call_id: int = 0 + tag_name: str = field(default=None, compare=False) + message: str = field(default="", compare=False) + group_mates: list = field(default=None, compare=False) + + def __lt__(self, other): + if not isinstance(other, GradAnomalyData): + return NotImplemented + if self.step < other.step: + return True + if self.micro_step < other.micro_step: + return True + if self.pp_stage > other.pp_stage: + return True + if self.vpp_stage > other.vpp_stage: + return True + if self.call_id < other.call_id: + return True + return False + + def __le__(self, other): + if not isinstance(other, GradAnomalyData): + return NotImplemented + return self == other or self < other + + def to_dict(self): + return self.__dict__ + + def get_key(self): + return self.tag_name + "_call_" + str(self.call_id) + class SummaryWriterWithAD(SummaryWriter): - def __init__(self, path, ad_rules, job_id, anomaly_inform=False): + def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None): super().__init__(path) self.tag2scalars = defaultdict(list) self.ad_rules = ad_rules self.job_id = job_id self.anomaly_inform = anomaly_inform + self.anomaly_factory = anomaly_factory + self.anomalies = [] def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): new_avg = avg = scalar_value @@ -76,11 +150,18 @@ class SummaryWriterWithAD(SummaryWriter): self.tag2scalars[tag].append((scalar_value, new_avg)) detected, rule_name = self._ad(scalar_value, history=avg) if detected: - print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") - exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" + 
exception_message = f"Rule {rule_name} reports anomaly signal in {tag} at step {global_step}." + print_info_log(f"{bcolors.WARNING}> {exception_message}{bcolors.ENDC}") if self.anomaly_inform: self.anomaly_inform.run(exception_message, self.job_id) + if self.anomaly_factory: + self.anomalies.append(self.anomaly_factory.create(tag, exception_message, global_step)) return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) def _ad(self, scalar_value, history): return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) + + def get_anomalies(self): + """返回已检测到的异常列表 + """ + return self.anomalies \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py b/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py index fad007fe35c..edd5829d50a 100644 --- a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py +++ b/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py @@ -106,6 +106,13 @@ class ApiRegistry: dist.Work.wait = wrapped_wait(dist.Work) +def check_process_group(process_group): + group = None + if isinstance(process_group, dist.ProcessGroup): + group = process_group + if group is None: + group = dist.GroupMember.WORLD # default group + return group def stack_filter(stack): for pattern in StackBlackList: diff --git a/debug/accuracy_tools/kj600/kj600/file_check.py b/debug/accuracy_tools/kj600/kj600/file_check.py new file mode 100644 index 00000000000..6adcf42b0c6 --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/file_check.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os +import re + +from kj600.utils import print_info_log + + +class CodedException(Exception): + def __init__(self, code, error_info=""): + super().__init__() + self.code = code + self.error_info = self.err_strs.get(code) + error_info + + def __str__(self): + return self.error_info + + +class FileCheckException(CodedException): + INVALID_FILE_ERROR = 0 + FILE_PERMISSION_ERROR = 1 + SOFT_LINK_ERROR = 2 + ILLEGAL_PATH_ERROR = 3 + ILLEGAL_PARAM_ERROR = 4 + FILE_TOO_LARGE_ERROR = 5 + + err_strs = { + SOFT_LINK_ERROR: "[kj600] 检测到软链接: ", + FILE_PERMISSION_ERROR: "[kj600] 文件权限错误: ", + INVALID_FILE_ERROR: "[kj600] 无效文件: ", + ILLEGAL_PATH_ERROR: "[kj600] 非法文件路径: ", + ILLEGAL_PARAM_ERROR: "[kj600] 非法打开方式: ", + FILE_TOO_LARGE_ERROR: "[kj600] 文件过大: ", + } + + +class FileCheckConst: + """ + Class for file check const + """ + + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + FILE_PATTERN = r"^[a-zA-Z0-9_./-]+$" + JSON_SUFFIX = ".json" + MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + JSON_SUFFIX: MAX_JSON_SIZE, + } + + +class FileChecker: + """ + The class for check file. + + Attributes: + file_path: The file or dictionary path to be verified. 
+ path_type: file or dictionary + ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability + file_type(str): The correct file type for file + """ + + def __init__( + self, file_path, path_type, ability=None, file_type=None, is_script=True + ): + self.file_path = file_path + self.path_type = self._check_path_type(path_type) + self.ability = ability + self.file_type = file_type + self.is_script = is_script + + @staticmethod + def _check_path_type(path_type): + if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: + print_info_log( + f"The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}." + ) + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + return path_type + + def common_check(self): + """ + 功能:用户校验基本文件权限:软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符 + 注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现 + """ + check_path_exists(self.file_path) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + check_path_type(self.file_path, self.path_type) + self.check_path_ability() + if self.is_script: + check_path_owner_consistent(self.file_path) + check_path_pattern_vaild(self.file_path) + check_common_file_size(self.file_path) + check_file_suffix(self.file_path, self.file_type) + return self.file_path + + def check_path_ability(self): + if self.ability == FileCheckConst.WRITE_ABLE: + check_path_writability(self.file_path) + if self.ability == FileCheckConst.READ_ABLE: + check_path_readability(self.file_path) + if self.ability == FileCheckConst.READ_WRITE_ABLE: + check_path_readability(self.file_path) + check_path_writability(self.file_path) + + +class FileOpen: + """ + The class for open file by a safe way. + + Attributes: + file_path: The file or dictionary path to be opened. 
+ mode(str): The file open mode + """ + + SUPPORT_READ_MODE = ["r", "rb"] + SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] + SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] + + def __init__(self, file_path, mode, encoding="utf-8"): + self.file_path = file_path + self.mode = mode + self.encoding = encoding + self._handle = None + + def __enter__(self): + self.check_file_path() + binary_mode = "b" + if binary_mode not in self.mode: + self._handle = open(self.file_path, self.mode, encoding=self.encoding) + else: + self._handle = open(self.file_path, self.mode) + return self._handle + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._handle: + self._handle.close() + + def check_file_path(self): + support_mode = ( + self.SUPPORT_READ_MODE + + self.SUPPORT_WRITE_MODE + + self.SUPPORT_READ_WRITE_MODE + ) + if self.mode not in support_mode: + print_info_log("File open not support %s mode" % self.mode) + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + self.check_ability_and_owner() + check_path_pattern_vaild(self.file_path) + if os.path.exists(self.file_path): + check_common_file_size(self.file_path) + + def check_ability_and_owner(self): + if self.mode in self.SUPPORT_READ_MODE: + check_path_exists(self.file_path) + check_path_readability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): + check_path_readability(self.file_path) + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + print_info_log("The file path {} is a soft 
link.".format(path)) + raise FileCheckException(FileCheckException.SOFT_LINK_ERROR) + + +def check_path_length(path, name_length=None): + file_max_name_length = ( + name_length if name_length else FileCheckConst.FILE_NAME_LENGTH + ) + if ( + len(path) > FileCheckConst.DIRECTORY_LENGTH + or len(os.path.basename(path)) > file_max_name_length + ): + print_info_log("The file path length exceeds limit.") + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_exists(path): + if not os.path.exists(path): + print_info_log("The file path %s does not exist." % path) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + print_info_log("The file path %s is not readable." % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + print_info_log("The file path %s is not writable." % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_executable(path): + if not os.access(path, os.X_OK): + print_info_log("The file path %s is not executable." % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_other_user_writable(path): + st = os.stat(path) + if st.st_mode & 0o002: + print_info_log( + "The file path %s may be insecure because other users have write permissions. " + % path + ) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_owner_consistent(path): + file_owner = os.stat(path).st_uid + if file_owner != os.getuid(): + print_info_log( + "The file path %s may be insecure because is does not belong to you." % path + ) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_pattern_vaild(path): + if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): + print_info_log("The file path %s contains special characters." 
% (path)) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_file_size(file_path, max_size): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + print_info_log(f"The size of file path {file_path} exceeds {max_size} bytes.") + raise FileCheckException(FileCheckException.FILE_TOO_LARGE_ERROR) + + +def check_common_file_size(file_path): + if os.path.isfile(file_path): + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break + + +def check_file_suffix(file_path, file_suffix): + if file_suffix: + if not file_path.endswith(file_suffix): + print_info_log(f"The {file_path} should be a {file_suffix} file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def check_path_type(file_path, file_type): + if file_type == FileCheckConst.FILE: + if not os.path.isfile(file_path): + print_info_log(f"The {file_path} should be a file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + if file_type == FileCheckConst.DIR: + if not os.path.isdir(file_path): + print_info_log(f"The {file_path} should be a dictionary!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + except OSError as ex: + raise FileCheckException( + FileCheckException.ILLEGAL_PATH_ERROR, + "Failed to create {}. Please check the path permission or disk space .{}".format( + dir_path, str(ex) + ), + ) from ex + + +def check_path_before_create(path): + if path_len_exceeds_limit(path): + raise FileCheckException( + FileCheckException.ILLEGAL_PATH_ERROR, "The file path length exceeds limit." 
+ ) + + if not re.match(FileCheckConst.FILE_PATTERN, os.path.realpath(path)): + raise FileCheckException( + FileCheckException.ILLEGAL_PATH_ERROR, + "The file path {} contains special characters.".format(path), + ) + + +def change_mode(path, mode): + if not os.path.exists(path) or os.path.islink(path): + return + try: + os.chmod(path, mode) + except PermissionError as ex: + raise FileCheckException( + FileCheckException.FILE_PERMISSION_ERROR, + "Failed to change {} authority. {}".format(path, str(ex)), + ) from ex + + +def path_len_exceeds_limit(file_path): + return ( + len(os.path.realpath(file_path)) > FileCheckConst.DIRECTORY_LENGTH + or len(os.path.basename(file_path)) > FileCheckConst.FILE_NAME_LENGTH + ) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index d13736aeb20..6108fcde55f 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -11,10 +11,11 @@ from kj600.module_spec_verifier import get_config, validate_config_spec from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer -from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD +from kj600.anomaly_detect import AnomalyScanner, AnomalyDataFactory, SummaryWriterWithAD from kj600.anomaly_inform import AnomalyInformFactory +from kj600.anomaly_analyse import AnomalyDataWriter from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics -from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate +from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, check_process_group from kj600.utils import print_warn_log, print_info_log, get_param_struct @@ -84,6 +85,8 @@ class GradContext: self.pre = 
[] self.post = [] self.grad_acc = None + self.vpp_stage = 0 + self.call_id = 0 def reset(self): self.pre.clear() @@ -96,12 +99,13 @@ class TrainerMon: tensor_metrics = TensorMetrics() # opt_ty: "Megatron_Float16OptimizerWithFloat16Params" or "Megatron_DistributedOptimizer" - def __init__(self, config_file_path, params_have_main_grad=True, opt_ty=None) -> None: + def __init__(self, config_file_path, process_group=None, params_have_main_grad=True, opt_ty=None) -> None: self.module_fwd_hook_context_by_module = defaultdict(ModuleHookContext) self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext) self.optimizer_context = defaultdict(OptimizerContext) self.cc_context = defaultdict(CommunicationContext) self.grad_context = defaultdict(GradContext) + self.process_group = check_process_group(process_group) self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) self.module_rank_list = self.config.get("module_ranks", []) @@ -145,17 +149,32 @@ class TrainerMon: self.alert_rules = AnomalyScanner.load_rules(alert_setting["rules"]) anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None - self.optimizer_hooked = False + self.vpp = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] if dist.is_initialized(): - if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0: + rank = dist.get_rank() + pp_stage = dist.get_group_rank(self.process_group, rank) + group_mates = dist.get_process_group_ranks(self.process_group) + if (rank in self.module_rank_list) or len(self.module_rank_list) == 0: + self.anomaly_data_factory = AnomalyDataFactory(rank, pp_stage, group_mates) self.summary_writer = SummaryWriterWithAD( - os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) + 
os.path.join(output_base_dir, f"{cur_time}-rank{rank}-{unique_id}"), + self.alert_rules, + unique_id, + anomaly_inform, + self.anomaly_data_factory) else: - self.summary_writer = SummaryWriterWithAD(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) + rank = 0 + self.anomaly_data_factory = AnomalyDataFactory(rank, 0, [0]) + self.summary_writer = SummaryWriterWithAD( + os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), + self.alert_rules, + unique_id, + anomaly_inform, + self.anomaly_data_factory) # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) @@ -173,6 +192,10 @@ class TrainerMon: self.print_struct = self.config.get("print_struct", False) self.struct_printed = False self.module_struct = {} + self.call_id = 0 # 用以对hook调用顺序进行排序 + self.anomaly_data_writer = AnomalyDataWriter( + os.path.join(output_base_dir, "anomaly_detected"), rank) + self.anomaly_data_writer.init_detected_json() return def __del__(self): @@ -219,10 +242,11 @@ class TrainerMon: self.hook_optimizer() return - def monitor_gnorm_with_ad(self, model, grad_acc_steps): + def monitor_gnorm_with_ad(self, model, grad_acc_steps, process_group=None): + self.micro_batch_number = grad_acc_steps + self._hook_weights(model) self.hook_optimizer() - self.micro_batch_number = grad_acc_steps def build_tbtag_tensor_map(self, module_name, tag, tensor): metrics = {} @@ -278,10 +302,11 @@ class TrainerMon: for name in self.param2name.values(): context = self.grad_context[name] + # 将当前parameter的call_id vpp_stage等信息告知异常生成类 + self.anomaly_data_factory.set_context(context) for metric_name in self.ops: write_metrics_tensorboard(metric_name, self.summary_writer, context.pre, step) write_metrics_tensorboard(metric_name, self.summary_writer, context.post, step) - context.reset() def hook_optimizer(self): @@ -371,7 +396,8 @@ class 
TrainerMon: write_metrics_tensorboard(metric_name, self.summary_writer, context.metric_list, context.step) context.metric_list.clear() context.step += 1 - + self.anomaly_data_writer.write_detected_json(self.summary_writer.get_anomalies()) + self.call_id = 0 return if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): register_optimizer_step_pre_hook(optimizer_pre_step_hook) @@ -484,22 +510,41 @@ class TrainerMon: def _hook_weights(self, model): self.wg_distribution = True - rank = dist.get_rank() if dist.is_initialized() else None def param_hook(grad, context): with torch.no_grad(): context.grad_acc += grad + context.call_id = self.call_id + self.call_id += 1 + + def register_hooks(model_chunk, vpp_stage=None): + for param_name, param in model_chunk.named_parameters(): + prefix = "" if vpp_stage is None else f"{vpp_stage}_" + name = prefix + param_name + for target in self.config['targets'].keys(): + context = self.grad_context[name] + context.vpp_stage = 0 if vpp_stage is None else vpp_stage + if param_name.startswith(target) and param.requires_grad: + self._smallest_rank_print(f'>> monitoring: {name}') + self.param2name[param] = name + param.register_hook(partial(param_hook, context=context)) + context.grad_acc = torch.zeros_like(param).to(DEVICE) + if self.print_struct: self.module_struct = { module_name: 1. 
for module_name, module in model.named_modules()} return - for name, param in model.named_parameters(): - for target in self.config['targets'].keys(): - context = self.grad_context[name] - if name.startswith(target) and param.requires_grad: - self._smallest_rank_print(f'>> monitoring: {name}') - self.param2name[param] = name - param.register_hook(partial(param_hook, context=context)) - context.grad_acc = torch.zeros_like(param).to(DEVICE) + if isinstance(model, list): + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') + + for index, model_chunk in enumerate(model): + vpp_stage = index if self.vpp else 0 + register_hooks(model_chunk, vpp_stage=vpp_stage) + + else: + register_hooks(model) + \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index e09536b072c..9415fdafcfe 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -134,12 +134,12 @@ class IdentMetric(Metric): return tensor @staticmethod - def metric_tensorboard(metric_name, summary_writer, metric_value, step): #metric_value is a dict, key is parameter name and value is a list of scalar tensor + def metric_tensorboard(metric_name, summary_writer, metric_value, context): #metric_value is a dict, key is parameter name and value is a list of scalar tensor if len(metric_value) == 1: for key, value in metric_value[0][metric_name].items(): if not value: continue - summary_writer.add_scalar(f'{key}_identical', value.item(), step) + summary_writer.add_scalar(f'{key}_identical', value.item(), context) def get_metrics(metric_name, tag2tensor, eps): @@ -150,9 +150,9 @@ def get_metrics(metric_name, tag2tensor, eps): raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e -def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step): +def 
write_metrics_tensorboard(metric_name, summary_writer, metric_value, context): try: fun_metric = config_metric_registry[metric_name] - return fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) + return fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, context) except KeyError as e: raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e -- Gitee From 403e49e91ac76ea09d26ede798d1c990ff267f6f Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 11:18:39 +0000 Subject: [PATCH 15/94] support backward only --- .../accuracy_tools/kj600/kj600/module_hook.py | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 1b658f33e05..6246746bb94 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -42,8 +42,8 @@ class ModuleHookContext: self.format_by_arg[key_name] = target_config[self.module_name][key_name] elif key_name in ['input', 'input_grad']: self.ignore_in = True - else: - raise KeyError(f"Missing key: {key_name} of {self.module_name} in config.json") + # else: + # raise KeyError(f"Missing key: {key_name} of {self.module_name} in config.json") class OptimizerContext: @@ -121,6 +121,7 @@ class TrainerMon: self.forward_only = self.config.get('forward_only', False) if self.forward_only: print_rank_0("> only module forward is monitored. 
") + self.backward_only = self.config.get('backward_only', False) self.ur_distribution = self.config.get('ur_distribution', False) if not self.ur_distribution: @@ -255,6 +256,7 @@ class TrainerMon: self.hook_optimizer() self.micro_batch_number = grad_acc_steps self.wg_distribution = True + self.backward_only = True if self.print_struct: self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)} @@ -300,15 +302,18 @@ class TrainerMon: if not self.xy_distribution: return for _, fwd_context in self.module_fwd_hook_context_by_module.items(): - print(fwd_context.actv) if not len(fwd_context.actv) == self.micro_batch_number: print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") + if len(bwd_context.actv) == 0: + continue self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv') fwd_context.actv.clear() for _, bwd_context in self.module_bwd_hook_context_by_module.items(): if not len(bwd_context.actvgrad) == self.micro_batch_number: print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") + if len(bwd_context.actvgrad) == 0: + continue self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') bwd_context.actvgrad.clear() @@ -431,19 +436,21 @@ class TrainerMon: def _register_param_name(self, model): if self.param_registered: return - if isinstance(model, list): - if len(model) > 1: - self.vpp = True - self._smallest_rank_print('vpp enabled') - - for vpp_stage, model_chunk in enumerate(model): - prefix = f'{vpp_stage}_' if self.vpp else '' - for param_name, param in model_chunk.named_parameters(): - name = prefix + param_name - for target in self.config['targets'].keys(): - if param_name.startswith(target) and param.requires_grad: - self._smallest_rank_print(f'>> monitoring: {name}') - 
self.param2name[param] = name + if not isinstance(model, list): + model = [model] + + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') + + for vpp_stage, model_chunk in enumerate(model): + prefix = f'{vpp_stage}_' if self.vpp else '' + for param_name, param in model_chunk.named_parameters(): + name = prefix + param_name + for target in self.config['targets'].keys(): + if param_name.startswith(target) and param.requires_grad: + self._smallest_rank_print(f'>> monitoring: {name}') + self.param2name[param] = name self.param_registered = True @@ -502,6 +509,8 @@ class TrainerMon: if not context.format_by_arg: context.set_format_by_arg('input_grad', self.config['targets']) context.set_format_by_arg('output_grad', self.config['targets']) + if not context.format_by_arg: + return if not context.verified: if not context.ignore_in: context.focused_in_col = validate_config_spec(context.format_by_arg['input_grad'], input_grad, context.module_name, 'input_grad') @@ -532,8 +541,9 @@ class TrainerMon: for name, submodule in module.named_modules(): self.module_struct[name] = {} if name in target_names: - submodule.register_forward_hook(fwd_hook_fun) - self.module_fwd_hook_context_by_module[submodule] = ModuleHookContext(name) + if not self.backward_only: + submodule.register_forward_hook(fwd_hook_fun) + self.module_fwd_hook_context_by_module[submodule] = ModuleHookContext(name) if not self.forward_only: submodule.register_full_backward_hook(bwd_hook_fun) self.module_bwd_hook_context_by_module[submodule] = ModuleHookContext(name) -- Gitee From 9949fdbe08f11da4637ff7e0da16548fe22151ac Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 11:21:18 +0000 Subject: [PATCH 16/94] update readme --- debug/accuracy_tools/kj600/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index 6acf189bbae..be4b0fc1b0d 100644 --- 
a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -62,16 +62,18 @@ hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=args.global_batch_size/ 3. 在json文件中配置工具 ``` { - "targets": { - "module.language_model.encoder.layers.0": {} + "targets": { + "module": {}, + "module.module.language_model.encoder.layers.0": {"input_grad":"tuple[1]:0", "output_grad":"tuple[2]:0"} }, "print_struct": false, # 若不了解模型结构,可以打开print_struct打印模型结构 "module_ranks": [0,1,2,3], # 需要监控的rank "wg_distribution": true, + "format": "csv", "alert": { "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}] }, - "ops": ["norm"], + "ops": ["norm", "min", "max", "mean"], "eps": 1e-8 } ``` -- Gitee From 4a015a38801104980a490cd9c60df82bd0803436 Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 11:26:51 +0000 Subject: [PATCH 17/94] update readme --- debug/accuracy_tools/kj600/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index be4b0fc1b0d..874cfaea31d 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -39,8 +39,8 @@ pip install . 模型训练状态的异常通常会反映在loss和梯度上,通过对模型各个模块梯度的监控,可以帮助快速定位异常的第一现场。 -1. 配置tensorboard写入的目录 -监控结果写入tensorboard的event文件中,设置输出路径(默认为`kj600_output`) +1. 
输出目录 +监控结果写入tensorboard的event文件/csv中,设置输出路径(默认为`kj600_output`) ```bash export KJ600_OUTPUT_DIR=/xxx/output_dir -- Gitee From 6ac2582a95b16b46b901b033d89742cc0b67831e Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 5 Aug 2024 11:34:38 +0000 Subject: [PATCH 18/94] add metric mean --- debug/accuracy_tools/kj600/kj600/features.py | 4 ++++ debug/accuracy_tools/kj600/kj600/module_metric.py | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/features.py b/debug/accuracy_tools/kj600/kj600/features.py index 7810188f7d7..09b48cffdaf 100644 --- a/debug/accuracy_tools/kj600/kj600/features.py +++ b/debug/accuracy_tools/kj600/kj600/features.py @@ -11,6 +11,10 @@ def square_sum(x: torch.tensor): def get_min(x: torch.tensor): return torch.min(x) +@torch.no_grad() +def get_mean(x: torch.tensor): + return torch.mean(x) + @torch.no_grad() def get_norm(x: torch.tensor): return torch.norm(x, p=2) diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 57934ed8199..6a59af89b19 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -1,7 +1,7 @@ import math import statistics -from kj600.features import square_sum, get_max, get_min, get_zeros, get_nans, get_norm +from kj600.features import square_sum, get_max, get_min, get_zeros, get_nans, get_norm, get_mean def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): @@ -75,6 +75,19 @@ class MinMetric(Metric): summary_writer.add_scalar(f'{key}_min', min_value, step) +@register_config_metric("mean") +class MeanMetric(Metric): + @staticmethod + def get_metric_value(tensor, eps): + return get_mean(tensor) + + @staticmethod + def metric_tensorboard(metric_name, summary_writer, metric_value, step): + for key in metric_value[0][metric_name].keys(): + mean_value = sum([item[metric_name][key].item() for item in metric_value]) / 
len(metric_value) + summary_writer.add_scalar(f'{key}_mean', mean_value, step) + + @register_config_metric("max") class MaxMetric(Metric): @staticmethod -- Gitee From c2e31e17cb926117e7f4364aa52e2b27e47abe6a Mon Sep 17 00:00:00 2001 From: jiandaobao Date: Mon, 5 Aug 2024 20:07:04 +0800 Subject: [PATCH 19/94] Add anomaly analyse module. --- .../accuracy_tools/kj600/kj600/module_hook.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 6108fcde55f..1e91c5a0a93 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -154,27 +154,32 @@ class TrainerMon: output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] + if dist.is_initialized(): rank = dist.get_rank() + tensorboard_dir = os.path.join(output_base_dir, f"{cur_time}-rank{rank}-{unique_id}") pp_stage = dist.get_group_rank(self.process_group, rank) group_mates = dist.get_process_group_ranks(self.process_group) - if (rank in self.module_rank_list) or len(self.module_rank_list) == 0: - self.anomaly_data_factory = AnomalyDataFactory(rank, pp_stage, group_mates) - self.summary_writer = SummaryWriterWithAD( - os.path.join(output_base_dir, f"{cur_time}-rank{rank}-{unique_id}"), - self.alert_rules, - unique_id, - anomaly_inform, - self.anomaly_data_factory) else: rank = 0 - self.anomaly_data_factory = AnomalyDataFactory(rank, 0, [0]) + tensorboard_dir = os.path.join(output_base_dir, f"{cur_time}-{unique_id}") + pp_stage = 0 + group_mates = [0] + + if (rank in self.module_rank_list) or len(self.module_rank_list) == 0: + # 初始化AnomalyData工厂 + self.anomaly_data_factory = AnomalyDataFactory(rank, pp_stage, group_mates) self.summary_writer = SummaryWriterWithAD( - os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), + 
tensorboard_dir, self.alert_rules, unique_id, anomaly_inform, self.anomaly_data_factory) + # 初始化anomaly deteted文件目录 + self.anomaly_data_writer = AnomalyDataWriter( + os.path.join(output_base_dir, "anomaly_detected"), rank) + self.anomaly_data_writer.init_detected_json() + # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) @@ -193,9 +198,6 @@ class TrainerMon: self.struct_printed = False self.module_struct = {} self.call_id = 0 # 用以对hook调用顺序进行排序 - self.anomaly_data_writer = AnomalyDataWriter( - os.path.join(output_base_dir, "anomaly_detected"), rank) - self.anomaly_data_writer.init_detected_json() return def __del__(self): -- Gitee From e4e0fa8cfd97359e87aa83c1628e0e8671a7870c Mon Sep 17 00:00:00 2001 From: jiandaobao Date: Mon, 5 Aug 2024 22:19:11 +0800 Subject: [PATCH 20/94] --amend --- .../kj600/kj600/anomaly_analyse.py | 134 ++++++++++++++++-- 1 file changed, 123 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py index 737ec26dd6e..d34e991fc27 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py @@ -15,20 +15,29 @@ # limitations under the License. 
""" -import os +import argparse import fcntl +import heapq import json +import os from pathlib import Path +import sys + from kj600.utils import print_info_log +from kj600.anomaly_detect import GradAnomalyData from kj600.file_check import ( change_mode, + check_link, FileCheckConst, check_path_before_create, FileChecker, + FileOpen ) - class AnomalyDataWriter: + """ + 异常数据写入类,负责将异常数据写入到JSON文件中。 + """ def __init__(self, dump_path, rank) -> None: self.dump_path = dump_path @@ -56,23 +65,25 @@ class AnomalyDataWriter: change_mode(self.json_path, FileCheckConst.DATA_FILE_AUTHORITY) def write_detected_json(self, anomalies): - anomalies_json = self._get_anomaly_dict(anomalies) - print_info_log(f"detected.json is at {self.dump_rank_dir}. ") + """落盘异常数据 + + 参数: + anomalies: GradAnomalyData对象列表 + """ + anomalies_json = self.get_anomaly_dict(anomalies) + print_info_log(f"anomaly.json is at {self.dump_rank_dir}. ") if Path(self.json_path).exists() and os.path.getsize(self.json_path) > 0: - with open(self.json_path, "r+") as f: + with FileOpen(self.json_path, "r+") as f: fcntl.flock(f, fcntl.LOCK_EX) data_to_write = json.load(f) fcntl.flock(f, fcntl.LOCK_UN) else: data_to_write = {} data_to_write.update(anomalies_json) - with open(self.json_path, "w+") as f: - fcntl.flock(f, fcntl.LOCK_EX) - json.dump(data_to_write, f, indent=1) - fcntl.flock(f, fcntl.LOCK_UN) + self.write_data_in_single_json(self.json_path, data_to_write) @staticmethod - def _get_anomaly_dict(anomalies): + def get_anomaly_dict(anomalies): """将GradAnomalyData列表转换为json """ anomalies_json = {} @@ -80,5 +91,106 @@ class AnomalyDataWriter: anomalies_json.update({anomaly.get_key(): anomaly.to_dict()}) return anomalies_json + @staticmethod + def write_data_in_single_json(json_path, anomalies_data): + with FileOpen(json_path, "w+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + json.dump(anomalies_data, f, indent=1) + fcntl.flock(f, fcntl.LOCK_UN) + + class AnomalyDataLoader: - pass \ No newline at end of file + def 
__init__(self, data_path) -> None: + self.data_path = data_path + + def get_anomalies_from_jsons(self): + """遍历文件夹,从rankK/anomaly.json中读取异常数据 + return: anomalies: GradAnomalyData对象列表 + """ + anomalies = [] + check_link(self.data_path) + for rank_dir in os.listdir(self.data_path): + rank_path = os.path.join(self.data_path, rank_dir) + if not os.path.isdir(rank_path): + continue + json_path = os.path.join(rank_path, "anomaly.json") + if not os.path.exists(json_path): + continue + file_check = FileChecker(json_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE) + file_check.common_check() + with FileOpen(json_path, 'r+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + data_anomalies = json.load(f) + fcntl.flock(f, fcntl.LOCK_UN) + instances = self.create_instances_from_dict(data_anomalies) + anomalies.extend(instances) + return anomalies + + def create_instances_from_dict(self, anomalies_dict: dict): + instances = [] + for values in anomalies_dict.values(): + try: + instances.append(GradAnomalyData(**values)) + except KeyError as e: + print_info_log(f"Missing key in anomaly data: {e}") + except ValueError as e: + print_info_log(f"Value error when creating a GradAnomalyData instance: {e}") + return instances + +class AnomalyAnalyse: + def __init__(self) -> None: + self.sorted_anomalies = [] + + def get_range_top_K(self, topk, step_list, anomalies): + """ + 获取前topk个step_list范围内的异常。 + """ + filtered_anomalies = [anomaly for anomaly in anomalies if anomaly.step in step_list] + if topk >= len(filtered_anomalies): + self.sorted_anomalies = filtered_anomalies + else: + self.sorted_anomalies = list(heapq.nsmallest(topk, filtered_anomalies)) + return self.sorted_anomalies + + def rewrite_sorted_anomalies(self, output_path): + """ + 将排序后的异常数据重新落盘 + """ + file_check = FileChecker(output_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + file_check.common_check() + + sorted_data = AnomalyDataWriter.get_anomaly_dict(self.sorted_anomalies) + json_path =os.path.join(output_path, 
"anomaly_analyse.json") + if os.path.exists(json_path): + os.remove(json_path) + Path(json_path).touch() + change_mode(json_path, FileCheckConst.DATA_FILE_AUTHORITY) + AnomalyDataWriter.write_data_in_single_json(json_path, sorted_data) + +def _get_parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--data_path", dest="data_path_dir", default="", type=str, + help=" The anomaly detect result dictionary: generate from kj600 tool.", + required=True) + parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str, + help=" The analyse task result out path.", + required=False) + parser.add_argument('-k', "--topk", dest="top_k_number", default=8, + help=" Top K number of earliest anomalies.", required=False) + parser.add_argument("-s", "--step", dest="step_list", default=[], type=list, + help=" Analyse which steps.", required=False) + return parser.parse_args(sys.argv[1:]) + +def _anomaly_analyse(): + args = _get_parse_args() + loader = AnomalyDataLoader(args.data_path_dir) + anomalies = loader.get_anomalies_from_jsons() + analyser = AnomalyAnalyse() + top_anomalies = analyser.get_range_top_K(args.top_k_number, args.step_list, anomalies) + analyser.rewrite_sorted_anomalies(args.out_path if args.out_path else args.data_path_dir) + for index, anomaly in enumerate(top_anomalies): + print_info_log(f"{index}: {anomaly.message}") + +if __name__=='__main__': + _anomaly_analyse() + print_info_log("Analyse task completed.") \ No newline at end of file -- Gitee From ca538c0bd0e6592fcf1e7de788aea325e22f8b8a Mon Sep 17 00:00:00 2001 From: jiandaobao Date: Tue, 6 Aug 2024 09:13:44 +0800 Subject: [PATCH 21/94] Modify the compare function. 
--- .../kj600/kj600/anomaly_analyse.py | 11 +++++++--- .../kj600/kj600/anomaly_detect.py | 20 +++++++++---------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py index d34e991fc27..119e35ca0e8 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py @@ -145,9 +145,12 @@ class AnomalyAnalyse: """ 获取前topk个step_list范围内的异常。 """ - filtered_anomalies = [anomaly for anomaly in anomalies if anomaly.step in step_list] + if not step_list: + filtered_anomalies = anomalies + else: + filtered_anomalies = [anomaly for anomaly in anomalies if anomaly.step in step_list] if topk >= len(filtered_anomalies): - self.sorted_anomalies = filtered_anomalies + self.sorted_anomalies = sorted(filtered_anomalies) else: self.sorted_anomalies = list(heapq.nsmallest(topk, filtered_anomalies)) return self.sorted_anomalies @@ -175,7 +178,7 @@ def _get_parse_args(): parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str, help=" The analyse task result out path.", required=False) - parser.add_argument('-k', "--topk", dest="top_k_number", default=8, + parser.add_argument('-k', "--topk", dest="top_k_number", default=8, type=int, help=" Top K number of earliest anomalies.", required=False) parser.add_argument("-s", "--step", dest="step_list", default=[], type=list, help=" Analyse which steps.", required=False) @@ -188,6 +191,8 @@ def _anomaly_analyse(): analyser = AnomalyAnalyse() top_anomalies = analyser.get_range_top_K(args.top_k_number, args.step_list, anomalies) analyser.rewrite_sorted_anomalies(args.out_path if args.out_path else args.data_path_dir) + + print_info_log(f"Top {args.top_k_number} anomalies are listed as follows:") for index, anomaly in enumerate(top_anomalies): print_info_log(f"{index}: {anomaly.message}") diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py 
b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 0ccb481ab0f..bc066a19a13 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -108,16 +108,16 @@ class GradAnomalyData: def __lt__(self, other): if not isinstance(other, GradAnomalyData): return NotImplemented - if self.step < other.step: - return True - if self.micro_step < other.micro_step: - return True - if self.pp_stage > other.pp_stage: - return True - if self.vpp_stage > other.vpp_stage: - return True - if self.call_id < other.call_id: - return True + if self.step != other.step: + return self.step < other.step + if self.micro_step != other.micro_step: + return self.micro_step < other.micro_step + if self.pp_stage != other.pp_stage: + return self.pp_stage > other.pp_stage + if self.vpp_stage != other.vpp_stage: + return self.vpp_stage > other.vpp_stage + if self.call_id != other.call_id: + return self.call_id < other.call_id return False def __le__(self, other): -- Gitee From 9f6fca5863b035454bc9b0d785d6e8932f5f36f3 Mon Sep 17 00:00:00 2001 From: jiandaobao Date: Tue, 6 Aug 2024 09:26:54 +0800 Subject: [PATCH 22/94] Copy FilerChecker from msporbe. --- .../accuracy_tools/kj600/kj600/file_check.py | 324 ++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 debug/accuracy_tools/kj600/kj600/file_check.py diff --git a/debug/accuracy_tools/kj600/kj600/file_check.py b/debug/accuracy_tools/kj600/kj600/file_check.py new file mode 100644 index 00000000000..21f9e351a2f --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/file_check.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os +import re + +from kj600.utils import print_info_log + + +class CodedException(Exception): + def __init__(self, code, error_info=""): + super().__init__() + self.code = code + self.error_info = self.err_strs.get(code) + error_info + + def __str__(self): + return self.error_info + + +class FileCheckException(CodedException): + INVALID_FILE_ERROR = 0 + FILE_PERMISSION_ERROR = 1 + SOFT_LINK_ERROR = 2 + ILLEGAL_PATH_ERROR = 3 + ILLEGAL_PARAM_ERROR = 4 + FILE_TOO_LARGE_ERROR = 5 + + err_strs = { + SOFT_LINK_ERROR: "[kj600] 检测到软链接: ", + FILE_PERMISSION_ERROR: "[kj600] 文件权限错误: ", + INVALID_FILE_ERROR: "[kj600] 无效文件: ", + ILLEGAL_PATH_ERROR: "[kj600] 非法文件路径: ", + ILLEGAL_PARAM_ERROR: "[kj600] 非法打开方式: ", + FILE_TOO_LARGE_ERROR: "[kj600] 文件过大: ", + } + + +class FileCheckConst: + """ + Class for file check const + """ + + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + FILE_PATTERN = r"^[a-zA-Z0-9_./-]+$" + JSON_SUFFIX = ".json" + MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + JSON_SUFFIX: MAX_JSON_SIZE, + } + + +class FileChecker: + """ + The class for check file. + + Attributes: + file_path: The file or dictionary path to be verified. 
+ path_type: file or dictionary + ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability + file_type(str): The correct file type for file + """ + + def __init__( + self, file_path, path_type, ability=None, file_type=None, is_script=True + ): + self.file_path = file_path + self.path_type = self._check_path_type(path_type) + self.ability = ability + self.file_type = file_type + self.is_script = is_script + + @staticmethod + def _check_path_type(path_type): + if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: + print_info_log( + f"The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}." + ) + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + return path_type + + def common_check(self): + """ + 功能:用户校验基本文件权限:软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符 + 注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现 + """ + check_path_exists(self.file_path) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + check_path_type(self.file_path, self.path_type) + self.check_path_ability() + if self.is_script: + check_path_owner_consistent(self.file_path) + check_path_pattern_vaild(self.file_path) + check_common_file_size(self.file_path) + check_file_suffix(self.file_path, self.file_type) + return self.file_path + + def check_path_ability(self): + if self.ability == FileCheckConst.WRITE_ABLE: + check_path_writability(self.file_path) + if self.ability == FileCheckConst.READ_ABLE: + check_path_readability(self.file_path) + if self.ability == FileCheckConst.READ_WRITE_ABLE: + check_path_readability(self.file_path) + check_path_writability(self.file_path) + + +class FileOpen: + """ + The class for open file by a safe way. + + Attributes: + file_path: The file or dictionary path to be opened. 
+ mode(str): The file open mode + """ + + SUPPORT_READ_MODE = ["r", "rb"] + SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] + SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] + + def __init__(self, file_path, mode, encoding="utf-8"): + self.file_path = file_path + self.mode = mode + self.encoding = encoding + self._handle = None + + def __enter__(self): + self.check_file_path() + binary_mode = "b" + if binary_mode not in self.mode: + self._handle = open(self.file_path, self.mode, encoding=self.encoding) + else: + self._handle = open(self.file_path, self.mode) + return self._handle + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._handle: + self._handle.close() + + def check_file_path(self): + support_mode = ( + self.SUPPORT_READ_MODE + + self.SUPPORT_WRITE_MODE + + self.SUPPORT_READ_WRITE_MODE + ) + if self.mode not in support_mode: + print_info_log("File open not support %s mode" % self.mode) + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + self.check_ability_and_owner() + check_path_pattern_vaild(self.file_path) + if os.path.exists(self.file_path): + check_common_file_size(self.file_path) + + def check_ability_and_owner(self): + if self.mode in self.SUPPORT_READ_MODE: + check_path_exists(self.file_path) + check_path_readability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): + check_path_readability(self.file_path) + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + print_info_log("The file path {} is a soft 
link.".format(path)) + raise FileCheckException(FileCheckException.SOFT_LINK_ERROR) + + +def check_path_length(path, name_length=None): + file_max_name_length = ( + name_length if name_length else FileCheckConst.FILE_NAME_LENGTH + ) + if ( + len(path) > FileCheckConst.DIRECTORY_LENGTH + or len(os.path.basename(path)) > file_max_name_length + ): + print_info_log("The file path length exceeds limit.") + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_exists(path): + if not os.path.exists(path): + print_info_log("The file path %s does not exist." % path) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + print_info_log("The file path %s is not readable." % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + print_info_log("The file path %s is not writable." % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_executable(path): + if not os.access(path, os.X_OK): + print_info_log("The file path %s is not executable." % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_other_user_writable(path): + st = os.stat(path) + if st.st_mode & 0o002: + print_info_log( + "The file path %s may be insecure because other users have write permissions. " + % path + ) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_owner_consistent(path): + file_owner = os.stat(path).st_uid + if file_owner != os.getuid(): + print_info_log( + "The file path %s may be insecure because is does not belong to you." % path + ) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_pattern_vaild(path): + if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): + print_info_log("The file path %s contains special characters." 
% (path)) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_file_size(file_path, max_size): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + print_info_log(f"The size of file path {file_path} exceeds {max_size} bytes.") + raise FileCheckException(FileCheckException.FILE_TOO_LARGE_ERROR) + + +def check_common_file_size(file_path): + if os.path.isfile(file_path): + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break + + +def check_file_suffix(file_path, file_suffix): + if file_suffix: + if not file_path.endswith(file_suffix): + print_info_log(f"The {file_path} should be a {file_suffix} file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def check_path_type(file_path, file_type): + if file_type == FileCheckConst.FILE: + if not os.path.isfile(file_path): + print_info_log(f"The {file_path} should be a file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + if file_type == FileCheckConst.DIR: + if not os.path.isdir(file_path): + print_info_log(f"The {file_path} should be a dictionary!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def check_path_before_create(path): + if path_len_exceeds_limit(path): + raise FileCheckException( + FileCheckException.ILLEGAL_PATH_ERROR, "The file path length exceeds limit." + ) + + if not re.match(FileCheckConst.FILE_PATTERN, os.path.realpath(path)): + raise FileCheckException( + FileCheckException.ILLEGAL_PATH_ERROR, + "The file path {} contains special characters.".format(path), + ) + + +def change_mode(path, mode): + if not os.path.exists(path) or os.path.islink(path): + return + try: + os.chmod(path, mode) + except PermissionError as ex: + raise FileCheckException( + FileCheckException.FILE_PERMISSION_ERROR, + "Failed to change {} authority. 
{}".format(path, str(ex)), + ) from ex + + +def path_len_exceeds_limit(file_path): + return ( + len(os.path.realpath(file_path)) > FileCheckConst.DIRECTORY_LENGTH + or len(os.path.basename(file_path)) > FileCheckConst.FILE_NAME_LENGTH + ) -- Gitee From 72c2366115568ffdc758eb2aa7698e277481d03e Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 6 Aug 2024 03:14:16 +0000 Subject: [PATCH 23/94] expend columns with metric tag --- debug/accuracy_tools/kj600/kj600/module_metric.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 6a59af89b19..8145e5d68d4 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -182,5 +182,11 @@ def write_metrics_csv(ops, summary_writer, metric_value, step, preifx=''): raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e if not summary_writer.header: - summary_writer.header = ['param_name'] + ops + keys = metric_value[0][metric_name].keys() + if len(keys) > 1: + summary_writer.header = ['param_name'] + for key in keys: + summary_writer.header.extend([f'{key.split("/")[-1]}_{op}' for op in ops]) + else: + summary_writer.header = ['param_name'] + ops summary_writer.write_csv(preifx, step) \ No newline at end of file -- Gitee From 0d7fd5809dcf973f0ef11c2fc2797dbe29d45d2c Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 6 Aug 2024 03:15:33 +0000 Subject: [PATCH 24/94] avoid warning --- debug/accuracy_tools/kj600/kj600/module_hook.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 6246746bb94..219dbf95f51 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -302,18 +302,18 @@ 
class TrainerMon: if not self.xy_distribution: return for _, fwd_context in self.module_fwd_hook_context_by_module.items(): - if not len(fwd_context.actv) == self.micro_batch_number: - print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") if len(bwd_context.actv) == 0: continue + if not len(fwd_context.actv) == self.micro_batch_number: + print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv') fwd_context.actv.clear() for _, bwd_context in self.module_bwd_hook_context_by_module.items(): - if not len(bwd_context.actvgrad) == self.micro_batch_number: - print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") if len(bwd_context.actvgrad) == 0: continue + if not len(bwd_context.actvgrad) == self.micro_batch_number: + print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') bwd_context.actvgrad.clear() -- Gitee From e7aa0f43444a45909687f6951df3bf77dfeaa490 Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 6 Aug 2024 06:05:18 +0000 Subject: [PATCH 25/94] update --- debug/accuracy_tools/kj600/kj600/module_hook.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 219dbf95f51..370cffc2b0c 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -85,7 +85,6 @@ class GradContext: self.post = [] self.acc_metric = [] self.acc = {} - self.micro_step = -1 def reset(self): self.pre.clear() @@ -258,9 +257,6 @@ class TrainerMon: self.wg_distribution = True self.backward_only = True 
- if self.print_struct: - self.module_struct = {vpp_stage:[module_name for module_name, module in model_chunk.named_modules()] for vpp_stage, model_chunk in enumerate(model)} - return self._register_param_name(model) self._hook_model_for_grad_acc(model) # self._hook_weights() -- Gitee From 488a29631943d0c33037e1463b128349097a36d0 Mon Sep 17 00:00:00 2001 From: wuyulong11 <2284273586@qq.com> Date: Thu, 1 Aug 2024 17:22:54 +0800 Subject: [PATCH 26/94] =?UTF-8?q?=E5=89=8D=E7=AB=AF=E6=A1=86=E6=9E=B6?= =?UTF-8?q?=E6=90=AD=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tb_graph_ascend/fe/.gitignore | 25 + .../tb_graph_ascend/fe/.prettierrc | 12 + .../tb_graph_ascend/fe/index.html | 32 + .../tb_graph_ascend/fe/package.json | 25 + .../tb_graph_ascend/fe/public/vite.svg | 1 + .../tb_graph_ascend/fe/src/tb-graph-app.ts | 601 ++++++++++++++++++ .../tb_graph_ascend/fe/src/vite-env.d.ts | 1 + .../tb_graph_ascend/fe/tsconfig.json | 28 + .../tb_graph_ascend/fe/vite.config.js | 6 + .../tb_graph_ascend/setup.py | 79 +++ 10 files changed, 810 insertions(+) create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/.gitignore create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/.prettierrc create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/index.html create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/package.json create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/public/vite.svg create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/src/tb-graph-app.ts create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/src/vite-env.d.ts create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/tsconfig.json create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/fe/vite.config.js create mode 100644 plugins/tensorboard-plugins/tb_graph_ascend/setup.py diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/.gitignore 
b/plugins/tensorboard-plugins/tb_graph_ascend/fe/.gitignore new file mode 100644 index 00000000000..0c763f704a0 --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/.gitignore @@ -0,0 +1,25 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local +*.lock + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/.prettierrc b/plugins/tensorboard-plugins/tb_graph_ascend/fe/.prettierrc new file mode 100644 index 00000000000..b2a5911b10a --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/.prettierrc @@ -0,0 +1,12 @@ +{ + "parser": "typescript", + "semi": true, + "singleQuote": true, + "jsxSingleQuote": false, + "bracketSpacing": true, + "tabWidth": 2, + "useTabs": false, + "trailingComma": "none", + "proseWrap": "always", + "endOfLine": "lf" +} \ No newline at end of file diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/index.html b/plugins/tensorboard-plugins/tb_graph_ascend/fe/index.html new file mode 100644 index 00000000000..94470cf1512 --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/index.html @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/package.json b/plugins/tensorboard-plugins/tb_graph_ascend/fe/package.json new file mode 100644 index 00000000000..c2f6e3a21f4 --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/package.json @@ -0,0 +1,25 @@ +{ + "name": "tbgraph", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "preview": "vite preview" + }, + "dependencies": { + "lit": "^3.1.4", + "@types/d3": "^7.4.3", + "@types/lodash": "^4.17.1" + }, + "devDependencies": { + "@types/dagre": "^0.7.52", + "d3": 
"^7.9.0", + "dagre": "^0.8.5", + "lodash": "^4.17.21", + "typescript": "^5.2.2", + "vite": "^5.3.1", + "vite-plugin-singlefile": "^2.0.2" + } +} diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/public/vite.svg b/plugins/tensorboard-plugins/tb_graph_ascend/fe/public/vite.svg new file mode 100644 index 00000000000..e7b8dfb1b2a --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/public/vite.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/src/tb-graph-app.ts b/plugins/tensorboard-plugins/tb_graph_ascend/fe/src/tb-graph-app.ts new file mode 100644 index 00000000000..3b879f2856c --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/src/tb-graph-app.ts @@ -0,0 +1,601 @@ +/* + *-------------------------------------------------------------------------------------------- + * Copyright 2020 The TensorFlow Authors. + * Copyright (c) 2024, Huawei Technologies. + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *--------------------------------------------------------------------------------------------* + */ + +import { LitElement, css, html, PropertyValueMap } from 'lit'; +import { customElement, property } from 'lit/decorators'; +import './components/dashboard_common/dashboard-layout'; + +import { observe } from '@polymer/decorators'; +import '../polymer/irons_and_papers'; +import { Canceller } from './backend/canceller'; +import { RequestManager } from './backend/requestManager'; +import { getRouter } from './backend/router'; +import '../dashboard_common/tf-dashboard-layout'; +import * as storage from './storage'; +import * as vz_sorting from './vz_sorting/sorting'; +import '../graph_board/tf-graph-board'; +import * as graph_op from './common/op'; +import * as graph_render from './common/render'; +import '../graph_controls/tf-graph-controls'; +import '../graph_loader/tf-graph-dashboard-loader'; + +/** + * The (string) name for the run of the selected dataset in the graph dashboard. + */ +const RUN_STORAGE_KEY = 'run'; + +@customElement('tb-graph-app') +export class TbGraphApp extends LitElement { + /** + * @type {!Array} + */ + @property({ + type: Array + }) + _datasets: any[] = []; + @property({ + type: Boolean + }) + _datasetsFetched: boolean = false; + @property({ + type: Number + }) + _selectedDataset: number = 0; + @property({ type: Object }) + _renderHierarchy: graph_render.RenderGraphInfo; + @property({ + type: Object + }) + _requestManager: RequestManager = new RequestManager(); + @property({ + type: Object + }) + _canceller: Canceller = new Canceller(); + @property({ type: Boolean }) + _debuggerDataEnabled: boolean; + @property({ type: Boolean }) + allStepsModeEnabled: boolean; + @property({ type: Number }) + specificHealthPillStep: number = 0; + @property({ + type: Boolean, + observer: '_healthPillsToggledOnChanged' + }) + healthPillsToggledOn: boolean = false; + @property({ + type: String, + notify: true + }) + selectedNode: string; + @property({ type: 
Boolean }) + _isAttached: boolean; + // Whether this dashboard is initialized. This dashboard should only be initialized once. + @property({ type: Boolean }) + _initialized: boolean; + // Whether health pills are currently being loaded, in which case we may want to say show a + // spinner. + @property({ type: Boolean }) + _areHealthPillsLoading: boolean; + // An array of alerts (in chronological order) provided by debugging libraries on when bad + // values (NaN, +/- Inf) appear. + @property({ + type: Array, + notify: true + }) + _debuggerNumericAlerts: unknown[] = []; + // Maps the names of nodes to an array of health pills (HealthPillDatums). + @property({ + type: Object + }) + _nodeNamesToHealthPills: object = {}; + @property({ type: Number }) + _healthPillStepIndex: number; + // A strictly increasing ID. Each request for health pills has a unique ID. This helps us + // identify stale requests. + @property({ type: Number }) + _healthPillRequestId: number = 1; + /** + * The setTimeout ID for the pending request for health pills at a + * specific step. + * + * @type {number?} + */ + @property({ type: Number }) _healthPillStepRequestTimerId: number | null; + // The request for health pills at a specific step (as opposed to all sampled health pills) may + // involve slow disk reads. Hence, we throttle to 1 of those requests every this many ms. 
+ @property({ + type: Number + }) + _healthPillStepRequestTimerDelay: number = 500; + @property({ type: Array }) + runs: unknown[]; + @property({ + type: String, + notify: true, + observer: '_runObserver' + }) + run: string = storage + .getStringInitializer(RUN_STORAGE_KEY, { + defaultValue: '', + useLocalStorage: false + }) + .call(this); + @property({ + type: Object + }) + _selection: object; + @property({ type: Object }) + _compatibilityProvider: object; + @property({ type: Boolean }) + _traceInputs: boolean; + @property({ type: Boolean }) + _autoExtractNodes: boolean; + @property({ type: Object }) + _selectedFile: any; + override attached() { + this.set('_isAttached', true); + } + override detached() { + this.set('_isAttached', false); + } + override ready() { + super.ready(); + + this.addEventListener( + 'node-toggle-expand', + this._handleNodeToggleExpand.bind(this) + ); + } + reload() { + if (!this._debuggerDataEnabled) { + // Check if the debugger plugin is enabled now. + this._requestManager.request(getRouter().pluginsListing()).then( + this._canceller.cancellable((result) => { + if (result.cancelled) { + return; + } + if (result.value['debugger']) { + // The debugger plugin is enabled. Request debugger-related + // data. Perhaps the debugger plugin had been disabled + // beforehand because no bad values (NaN, -/+ Inf) had been + // found and muted_if_healthy had been on. + this.set('_debuggerDataEnabled', true); + } + }) + ); + } + this._maybeFetchHealthPills(); + } + _fit() { + (this.$$('#graphboard') as any).fit(); + } + _onDownloadImageRequested(event: CustomEvent) { + (this.$$('#graphboard') as any).downloadAsImage(event.detail as string); + } + _getGraphDisplayClassName(_selectedFile: any, _datasets: any[]) { + const isDataValid = _selectedFile || _datasets.length; + return isDataValid ? 
'' : 'no-graph'; + } + _runObserver = storage.getStringObserver(RUN_STORAGE_KEY, { + defaultValue: '', + polymerProperty: 'run', + useLocalStorage: false + }); + _fetchDataset() { + return this._requestManager.request('/data/plugin/tb_graph_ascend/info'); + } + /* + * See also _maybeFetchHealthPills, _initiateNetworkRequestForHealthPills. + * This function returns a promise with the raw health pill data. + */ + _fetchHealthPills(nodeNames, step) { + const postData = { + node_names: JSON.stringify(nodeNames), + // Events files with debugger data fall under this special run. + run: '__debugger_data__' + }; + if (step !== undefined) { + // The user requested health pills for a specific step. This request + // might be slow since the backend reads events sequentially from disk. + postData['step'] = step; + } + const url = getRouter().pluginRoute('debugger', '/health_pills'); + return this._requestManager.request(url, postData); + } + _fetchDebuggerNumericsAlerts() { + return this._requestManager.request( + getRouter().pluginRoute('debugger', '/numerics_alert_report') + ); + } + _graphUrl(run, limitAttrSize, largeAttrsKey) { + return getRouter().pluginRouteForSrc( + 'graphs', + '/graph', + new URLSearchParams({ + run: run, + limit_attr_size: limitAttrSize, + large_attrs_key: largeAttrsKey + }) + ); + } + _shouldRequestHealthPills() { + // Do not load debugger data if the feature is disabled, if the user toggled off the feature, + // or if the graph itself has not loaded yet. We need the graph to load so that we know which + // nodes to request health pills for. + return ( + this._debuggerDataEnabled && + this.healthPillsToggledOn && + this._renderHierarchy && + this._datasetsState(this._datasetsFetched, this._datasets, 'PRESENT') + ); + } + @observe('_isAttached') + _maybeInitializeDashboard() { + var isAttached = this._isAttached; + console.log(isAttached); + if (this._initialized && isAttached) { + // Either this dashboard is already initialized ... 
or we are not yet ready to initialize. + return; + } + this.set('_compatibilityProvider', new graph_op.TpuCompatibilityProvider()); + // Set this to true so we only initialize once. + this._initialized = true; + this._fetchDataset().then((dataset) => { + const runNames = Object.keys(dataset); + // Transform raw data into UI friendly data. + this._datasets = runNames + .sort(vz_sorting.compareTagNames) + .map((runName) => { + const runData = dataset[runName]; + const tagNames = Object.keys(runData.tags).sort( + vz_sorting.compareTagNames + ); + const tags = tagNames + .map((name) => runData.tags[name]) + .map(({ tag, conceptual_graph, op_graph, profile }) => ({ + tag, + displayName: tag, + conceptualGraph: conceptual_graph, + opGraph: op_graph, + profile + })); + // Translate a run-wide GraphDef into specially named (without a tag) op graph + // to abstract the difference between run_graph vs. op_graph from other + // components. + const tagsWithV1Graph = runData.run_graph + ? [ + { + tag: null, + displayName: 'Default', + conceptualGraph: false, + opGraph: true, + profile: false + }, + ...tags + ] + : tags; + return { + name: runName, + tags: tagsWithV1Graph, + isvis: runData.is_vis + }; + }); + this._datasetsFetched = true; + }); + } + @observe('_datasetsFetched', '_datasets', 'run') + _determineSelectedDataset() { + var datasetsFetched = this._datasetsFetched; + var datasets = this._datasets; + var run = this.run; + // By default, load the first dataset. + if (!run) { + // By default, load the first dataset. + this.set('_selectedDataset', 0); + return; + } + // If the URL specifies a dataset, load it. + const dataset = datasets.findIndex((d) => d.name === run); + if (dataset === -1) { + if (datasetsFetched) { + // Tell the user if the dataset cannot be found to avoid misleading + // the user. 
+ const dialog = this.$$('#error-dialog') as any; + dialog.textContent = `No dataset named "${run}" could be found.`; + dialog.open(); + } + return; + } + this.set('_selectedDataset', dataset); + } + @observe('_datasetsFetched', '_datasets', '_selectedDataset') + _updateSelectedDatasetName() { + var datasetsFetched = this._datasetsFetched; + var datasets = this._datasets; + var selectedDataset = this._selectedDataset; + if (!datasetsFetched) return; + // Cannot update `run` to update the hash in case datasets for graph is empty. + if (datasets.length <= selectedDataset) return; + this.set('run', datasets[selectedDataset].name); + } + _requestHealthPills() { + this.set('_areHealthPillsLoading', true); + var requestId = ++this._healthPillRequestId; + if (this._healthPillStepRequestTimerId !== null) { + // A request for health pills is already scheduled to be initiated. Clear it, and schedule a + // new request. + window.clearTimeout(this._healthPillStepRequestTimerId); + this._healthPillStepRequestTimerId = null; + } + if (this.allStepsModeEnabled) { + // This path may be slow. Schedule network requests to start some time later. If another + // request is scheduled in the mean time, drop this current request. + this._healthPillStepRequestTimerId = setTimeout( + function () { + this._healthPillStepRequestTimerId = null; + this._initiateNetworkRequestForHealthPills(requestId); + }.bind(this), + this._healthPillStepRequestTimerDelay + ); + } else { + // The user is fetching sampled steps. This path is fast, so no need to throttle. Directly + // fetch the health pills across the network. + this._initiateNetworkRequestForHealthPills(requestId); + } + } + // Initiates the network request for health pills. Do not directly call this method - network + // requests may be throttled. Instead, call _requestHealthPills, which uses this method. 
+ _initiateNetworkRequestForHealthPills(requestId) { + if (this._healthPillRequestId !== requestId) { + // This possibly scheduled request was outdated before it was even sent across the network. Do + // not bother initiating it. + return; + } + const specificStep = this.allStepsModeEnabled + ? this.specificHealthPillStep + : undefined; + const healthPillsPromise = this._fetchHealthPills( + this._renderHierarchy.getNamesOfRenderedOps(), + specificStep + ); + const alertsPromise = this._fetchDebuggerNumericsAlerts(); + Promise.all([healthPillsPromise, alertsPromise]).then( + function (result) { + var healthPillsResult = result[0]; + var alertsResult = result[1]; + if (!this.healthPillsToggledOn) { + // The user has opted to hide health pills via the toggle button. + return; + } + if (requestId !== this._healthPillRequestId) { + // This response is no longer relevant. + return; + } + // Set the index for which step to show for the health pills. By default, show the last step. + // A precondition we assume (that Tensorboard's reservoir sampling guarantees) is that all + // node names should be mapped to the same number of steps. 
+ for (var nodeName in healthPillsResult) { + this.set( + '_healthPillStepIndex', + healthPillsResult[nodeName].length - 1 + ); + break; + } + this.set('_debuggerNumericAlerts', alertsResult); + this.set('_nodeNamesToHealthPills', healthPillsResult); + this.set('_areHealthPillsLoading', false); + this.set('_healthPillStepRequestTimerId', null); + }.bind(this) + ); + } + _datasetsState(datasetsFetched, datasets, state) { + if (!datasetsFetched) return state === 'NOT_LOADED'; + if (!datasets || !datasets.length) return state === 'EMPTY'; + return state === 'PRESENT'; + } + protected updated( + _changedProperties: PropertyValueMap | Map + ): void { + if (_changedProperties.has('_renderHierarchy')) { + this.reload(); + } + } + _renderHierarchyChanged() { + // Reload any data on the graph when the render hierarchy (which determines which nodes are + // rendered) changes. + this.reload(); + } + _handleNodeToggleExpand() { + // Nodes were toggled. We may need to request health pills for more nodes. + this._maybeFetchHealthPills(); + } + _healthPillsToggledOnChanged(healthPillsToggledOn) { + if (healthPillsToggledOn) { + // Load health pills. + this.reload(); + } else { + // Remove all health pills by setting an empty mapping. + this.set('_nodeNamesToHealthPills', {}); + } + } + // Fetch health pills for a specific step if applicable. + _maybeFetchHealthPills() { + if (!this._shouldRequestHealthPills()) { + return; + } + this._requestHealthPills(); + } + + render() { + return html` + + + +

+ +
+

No graph definition files were found.

+

+ To store a graph, create a + tf.summary.FileWriter + and pass the graph either via the constructor, or by calling its + add_graph() method. You may want to check out the + examining the TensorFlow graph tutorial. +

+ +

+ If you’re new to using TensorBoard, and want to find out how to + add data and set up your event files, check out the + README + and perhaps the + TensorBoard tutorial. +

+ +

+ If you think TensorBoard is configured properly, please see + the section of the README devoted to missing data problems + and consider filing an issue on GitHub. +

+
+
+ +
+
+ + `; + } + + static styles = css` + :host /deep/ { + font-family: 'Roboto', sans-serif; + } + + .sidebar { + display: flex; + height: 100%; + } + + .center { + position: relative; + height: 100%; + } + + paper-dialog { + padding: 20px; + } + + .no-data-message { + max-width: 540px; + margin: 80px auto 0 auto; + } + + .graphboard { + height: 100%; + } + + .no-graph .graphboard { + display: none; + } + + .center:not(.no-graph) .no-data-message { + display: none; + } + + a { + color: var(--tb-link); + } + + a:visited { + color: var(--tb-link-visited); + } + `; +} + +declare global { + interface HTMLElementTagNameMap { + 'tb-graph-app': TbGraphApp; + } +} diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/src/vite-env.d.ts b/plugins/tensorboard-plugins/tb_graph_ascend/fe/src/vite-env.d.ts new file mode 100644 index 00000000000..11f02fe2a00 --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/src/vite-env.d.ts @@ -0,0 +1 @@ +/// diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/tsconfig.json b/plugins/tensorboard-plugins/tb_graph_ascend/fe/tsconfig.json new file mode 100644 index 00000000000..5a19e14be1d --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/tsconfig.json @@ -0,0 +1,28 @@ +{ + "compilerOptions": { + "target": "ES2020", + "experimentalDecorators": true, + "useDefineForClassFields": false, + "module": "ESNext", + "lib": [ + "ES2020", + "DOM", + "DOM.Iterable" + ], + "skipLibCheck": true, + "moduleResolution": "Node", + "resolveJsonModule": true, + "isolatedModules": true, + "moduleDetection": "force", + "noEmit": true, + /* Linting */ + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true, + "noImplicitAny": false, + }, + "include": [ + "src" + ] +} \ No newline at end of file diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/fe/vite.config.js b/plugins/tensorboard-plugins/tb_graph_ascend/fe/vite.config.js new file mode 100644 index 
00000000000..00fc6bc05bb --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/fe/vite.config.js @@ -0,0 +1,6 @@ +import { defineConfig } from "vite"; +import { viteSingleFile } from "vite-plugin-singlefile"; + +export default defineConfig({ + plugins: [viteSingleFile()], +}); \ No newline at end of file diff --git a/plugins/tensorboard-plugins/tb_graph_ascend/setup.py b/plugins/tensorboard-plugins/tb_graph_ascend/setup.py new file mode 100644 index 00000000000..a75aec914e1 --- /dev/null +++ b/plugins/tensorboard-plugins/tb_graph_ascend/setup.py @@ -0,0 +1,79 @@ +# ------------------------------------------------------------------------- +# Copyright (c) 2024, Huawei Technologies. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#--------------------------------------------------------------------------------------------# + +import os +import setuptools + + +def read(rel_path): + here = os.path.abspath(os.path.dirname(__file__)) + with open(os.path.join(here, rel_path)) as fp: + return fp.read() + + +def get_version(rel_path): + for line in read(rel_path).splitlines(): + if line.startswith("__version__"): + delim = '"' if '"' in line else "'" + version = line.split(delim)[1] + + if os.getenv('TB_GRAPH_ASCEND_BUILD_VERSION'): + version = os.getenv('TB_GRAPH_ASCEND_BUILD_VERSION') + return version + + +INSTALL_REQUIRED = [ + "tensorboard >= 2.15.1" +] + + +setuptools.setup( + name="tb-graph-ascend", + version=get_version(os.path.join('tb_graph_ascend', '__init__.py')), + description="Model Hierarchical Visualization TensorBoard Plugin", + long_description="Model Hierarchical Visualization TensorBoard Plugin : \ + https://gitee.com/ascend/att/tree/master/plugins/tensorboard-plugins/tb_graph_ascend", + url="https://gitee.com/ascend/att/tree/master/plugins/tensorboard-plugins/tb_graph_ascend", + author="Ascend Team", + author_email="pmail_mindstudio@huawei.com", + packages=setuptools.find_packages(), + package_data={ + "tb_graph_ascend": ["static/**"], + }, + entry_points={ + "tensorboard_plugins": [ + "graph_ascend = tb_graph_ascend.plugin:GraphsPlugin", + ], + }, + python_requires=">=3.7", + install_requires=INSTALL_REQUIRED, + classifiers=[ + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: Python :: 3', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + license='BSD-3', + keywords='pytorch tensorboard 
graph plugin', +) -- Gitee From 509cedc4f4bd38e406a0dd348ed28ea9345cff3a Mon Sep 17 00:00:00 2001 From: jiandaobao Date: Tue, 6 Aug 2024 10:50:46 +0800 Subject: [PATCH 27/94] Modify readme. --- debug/accuracy_tools/kj600/README.md | 19 +- .../kj600/kj600/anomaly_analyse.py | 63 ++-- .../kj600/kj600/anomaly_detect.py | 17 +- .../accuracy_tools/kj600/kj600/file_check.py | 345 ------------------ .../accuracy_tools/kj600/kj600/module_hook.py | 29 +- .../kj600/kj600/module_metric.py | 2 +- 6 files changed, 73 insertions(+), 402 deletions(-) delete mode 100644 debug/accuracy_tools/kj600/kj600/file_check.py diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index 6acf189bbae..02f7691454c 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -55,7 +55,7 @@ from kj600.module_hook import TrainerMon model, optimizer, opt_param_scheduler = setup_model_and_optimizer( model_provider, model_type) # 模型初始化后插入工具代码 -hooker = TrainerMon("./monitor_config.json", params_have_main_grad=True) +hooker = TrainerMon("./monitor_config.json", process_group=None, params_have_main_grad=True) hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size) ``` @@ -75,7 +75,19 @@ hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=args.global_batch_size/ "eps": 1e-8 } ``` - +### 梯度异常时序判断 +1. 
进入工具路径启动脚本: +```shell +cd kj600/kj600 +python3 anomaly_analyse.py -d $KJ600_OUTPUT_DIR/anomaly_detected +``` +支持以下参数配置 +| 字段名字 | 解释 | 是否必选释 | +| ------ | -------- | -------- | +|-d 或 --data_path| 指定梯度异常落盘文件夹,梯度监控功能输出,一般为$KJ600_OUTPUT_DIR/anomaly_detected。|是 | +|-o 或 --out_path| 排序后的异常落盘文件地址,默认在--data_path路径下落盘一个anomaly_analyse.json文件| 否 | +|-k 或 --topk| 指定保留前topk个异常,默认为8| 否 | +|-s 或 --step_list| 指定分析的step范围,默认为[]| 否 | ## 详细配置 @@ -158,7 +170,7 @@ hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=args.global_batch_size/ ``` from kj600.module_hook import TrainerMon - hooker = TrainerMon("./llama2_config.json", params_have_main_grad=True, opt_ty="Megatron_DistributedOptimizer") # or opt_ty=Megatron_Float16OptimizerWithFloat16Params + hooker = TrainerMon("./llama2_config.json", process_group=None, params_have_main_grad=True, opt_ty="Megatron_DistributedOptimizer") # or opt_ty=Megatron_Float16OptimizerWithFloat16Params hooker.hook_modules(model=model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size) ``` params_have_main_grad: 若为True则参数权重梯度为main_grad,否则为grad,默认为True。 @@ -214,6 +226,7 @@ TrainerMon.__init__(config_file_path, params_have_main_grad=True, opt_ty=None) - | 参数 | 说明 | 是否必选 | | ----- | -------------------- | -------- | | config_file_path |自己写的json配置文件路径。 | 是 | +| process_group | 传入ProcessGroup对象,用以确定pipeline并行不同rank异常间时序,megatron下通过core.parallel_state.get_pipeline_model_parallel_group()获得 | 否 | | params_have_main_grad |权重是否使用main_grad,是就为True,否则为False。默认为True。 | 否 | | opt_ty |优化器类型,有两个选项,Megatron_DistributedOptimizer:使用bf16或者fp16混合精度时开启分布式优化器;Megatron_Float16OptimizerWithFloat16Params:使用bf16或者fp16混合精度选项并且不开启分布式优化器,也适用于常规的adam优化器。如果使用的不是adam优化器,使用None。默认为None。 | 否 | diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py index 119e35ca0e8..058ecf81b96 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py +++ 
b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py @@ -23,7 +23,7 @@ import os from pathlib import Path import sys -from kj600.utils import print_info_log +from kj600.utils import print_info_log, print_warn_log from kj600.anomaly_detect import GradAnomalyData from kj600.file_check import ( change_mode, @@ -38,12 +38,27 @@ class AnomalyDataWriter: """ 异常数据写入类,负责将异常数据写入到JSON文件中。 """ - def __init__(self, dump_path, rank) -> None: self.dump_path = dump_path self.dump_rank_dir = os.path.join(self.dump_path, f"rank{rank}") self.json_path = os.path.join(self.dump_rank_dir, "anomaly.json") + @staticmethod + def get_anomaly_dict(anomalies): + """将GradAnomalyData列表转换为json + """ + anomalies_json = {} + for anomaly in anomalies: + anomalies_json.update({anomaly.get_key(): anomaly.to_dict()}) + return anomalies_json + + @staticmethod + def write_data_in_single_json(json_path, anomalies_data): + with FileOpen(json_path, "w+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + json.dump(anomalies_data, f, indent=1) + fcntl.flock(f, fcntl.LOCK_UN) + def init_detected_json(self): """初始化落盘文件""" check_path_before_create(self.dump_path) @@ -81,28 +96,23 @@ class AnomalyDataWriter: data_to_write = {} data_to_write.update(anomalies_json) self.write_data_in_single_json(self.json_path, data_to_write) - - @staticmethod - def get_anomaly_dict(anomalies): - """将GradAnomalyData列表转换为json - """ - anomalies_json = {} - for anomaly in anomalies: - anomalies_json.update({anomaly.get_key(): anomaly.to_dict()}) - return anomalies_json - - @staticmethod - def write_data_in_single_json(json_path, anomalies_data): - with FileOpen(json_path, "w+") as f: - fcntl.flock(f, fcntl.LOCK_EX) - json.dump(anomalies_data, f, indent=1) - fcntl.flock(f, fcntl.LOCK_UN) - class AnomalyDataLoader: def __init__(self, data_path) -> None: self.data_path = data_path + @staticmethod + def create_instances_from_dict(anomalies_dict: dict): + instances = [] + for values in anomalies_dict.values(): + try: + 
instances.append(GradAnomalyData(**values)) + except KeyError as e: + print_warn_log(f"Missing key in anomaly data: {e}") + except ValueError as e: + print_warn_log(f"Value error when creating a GradAnomalyData instance: {e}") + return instances + def get_anomalies_from_jsons(self): """遍历文件夹,从rankK/anomaly.json中读取异常数据 return: anomalies: GradAnomalyData对象列表 @@ -125,17 +135,6 @@ class AnomalyDataLoader: instances = self.create_instances_from_dict(data_anomalies) anomalies.extend(instances) return anomalies - - def create_instances_from_dict(self, anomalies_dict: dict): - instances = [] - for values in anomalies_dict.values(): - try: - instances.append(GradAnomalyData(**values)) - except KeyError as e: - print_info_log(f"Missing key in anomaly data: {e}") - except ValueError as e: - print_info_log(f"Value error when creating a GradAnomalyData instance: {e}") - return instances class AnomalyAnalyse: def __init__(self) -> None: @@ -148,7 +147,7 @@ class AnomalyAnalyse: if not step_list: filtered_anomalies = anomalies else: - filtered_anomalies = [anomaly for anomaly in anomalies if anomaly.step in step_list] + filtered_anomalies = [anomaly for anomaly in anomalies if str(anomaly.step) in step_list] if topk >= len(filtered_anomalies): self.sorted_anomalies = sorted(filtered_anomalies) else: @@ -198,4 +197,4 @@ def _anomaly_analyse(): if __name__=='__main__': _anomaly_analyse() - print_info_log("Analyse task completed.") \ No newline at end of file + print_info_log("Analyse task completed.") diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index bc066a19a13..16771a50fd1 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -2,10 +2,10 @@ import statistics as st from abc import ABC from typing import List import sys +from dataclasses import dataclass, field from torch.utils.tensorboard import SummaryWriter from collections import 
defaultdict from kj600.utils import print_info_log -from dataclasses import dataclass, field class ScanRule(ABC): def apply(self, history, cur): @@ -129,7 +129,8 @@ class GradAnomalyData: return self.__dict__ def get_key(self): - return self.tag_name + "_call_" + str(self.call_id) + return ''.join( + (str(self.tag_name), "_step_", str(self.step), "_call_" , str(self.call_id))) class SummaryWriterWithAD(SummaryWriter): def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None): @@ -157,11 +158,15 @@ class SummaryWriterWithAD(SummaryWriter): if self.anomaly_factory: self.anomalies.append(self.anomaly_factory.create(tag, exception_message, global_step)) return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) - - def _ad(self, scalar_value, history): - return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) def get_anomalies(self): """返回已检测到的异常列表 """ - return self.anomalies \ No newline at end of file + return self.anomalies + + def clear_anomalies(self): + self.anomalies.clear() + + def _ad(self, scalar_value, history): + return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) + \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/file_check.py b/debug/accuracy_tools/kj600/kj600/file_check.py deleted file mode 100644 index 6adcf42b0c6..00000000000 --- a/debug/accuracy_tools/kj600/kj600/file_check.py +++ /dev/null @@ -1,345 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -import os -import re - -from kj600.utils import print_info_log - - -class CodedException(Exception): - def __init__(self, code, error_info=""): - super().__init__() - self.code = code - self.error_info = self.err_strs.get(code) + error_info - - def __str__(self): - return self.error_info - - -class FileCheckException(CodedException): - INVALID_FILE_ERROR = 0 - FILE_PERMISSION_ERROR = 1 - SOFT_LINK_ERROR = 2 - ILLEGAL_PATH_ERROR = 3 - ILLEGAL_PARAM_ERROR = 4 - FILE_TOO_LARGE_ERROR = 5 - - err_strs = { - SOFT_LINK_ERROR: "[kj600] 检测到软链接: ", - FILE_PERMISSION_ERROR: "[kj600] 文件权限错误: ", - INVALID_FILE_ERROR: "[kj600] 无效文件: ", - ILLEGAL_PATH_ERROR: "[kj600] 非法文件路径: ", - ILLEGAL_PARAM_ERROR: "[kj600] 非法打开方式: ", - FILE_TOO_LARGE_ERROR: "[kj600] 文件过大: ", - } - - -class FileCheckConst: - """ - Class for file check const - """ - - READ_ABLE = "read" - WRITE_ABLE = "write" - READ_WRITE_ABLE = "read and write" - DIRECTORY_LENGTH = 4096 - FILE_NAME_LENGTH = 255 - FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" - FILE_PATTERN = r"^[a-zA-Z0-9_./-]+$" - JSON_SUFFIX = ".json" - MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 - DIR = "dir" - FILE = "file" - DATA_DIR_AUTHORITY = 0o750 - DATA_FILE_AUTHORITY = 0o640 - FILE_SIZE_DICT = { - JSON_SUFFIX: MAX_JSON_SIZE, - } - - -class FileChecker: - """ - The class for check file. - - Attributes: - file_path: The file or dictionary path to be verified. 
- path_type: file or dictionary - ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability - file_type(str): The correct file type for file - """ - - def __init__( - self, file_path, path_type, ability=None, file_type=None, is_script=True - ): - self.file_path = file_path - self.path_type = self._check_path_type(path_type) - self.ability = ability - self.file_type = file_type - self.is_script = is_script - - @staticmethod - def _check_path_type(path_type): - if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: - print_info_log( - f"The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}." - ) - raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) - return path_type - - def common_check(self): - """ - 功能:用户校验基本文件权限:软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符 - 注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现 - """ - check_path_exists(self.file_path) - check_link(self.file_path) - self.file_path = os.path.realpath(self.file_path) - check_path_length(self.file_path) - check_path_type(self.file_path, self.path_type) - self.check_path_ability() - if self.is_script: - check_path_owner_consistent(self.file_path) - check_path_pattern_vaild(self.file_path) - check_common_file_size(self.file_path) - check_file_suffix(self.file_path, self.file_type) - return self.file_path - - def check_path_ability(self): - if self.ability == FileCheckConst.WRITE_ABLE: - check_path_writability(self.file_path) - if self.ability == FileCheckConst.READ_ABLE: - check_path_readability(self.file_path) - if self.ability == FileCheckConst.READ_WRITE_ABLE: - check_path_readability(self.file_path) - check_path_writability(self.file_path) - - -class FileOpen: - """ - The class for open file by a safe way. - - Attributes: - file_path: The file or dictionary path to be opened. 
- mode(str): The file open mode - """ - - SUPPORT_READ_MODE = ["r", "rb"] - SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] - SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] - - def __init__(self, file_path, mode, encoding="utf-8"): - self.file_path = file_path - self.mode = mode - self.encoding = encoding - self._handle = None - - def __enter__(self): - self.check_file_path() - binary_mode = "b" - if binary_mode not in self.mode: - self._handle = open(self.file_path, self.mode, encoding=self.encoding) - else: - self._handle = open(self.file_path, self.mode) - return self._handle - - def __exit__(self, exc_type, exc_val, exc_tb): - if self._handle: - self._handle.close() - - def check_file_path(self): - support_mode = ( - self.SUPPORT_READ_MODE - + self.SUPPORT_WRITE_MODE - + self.SUPPORT_READ_WRITE_MODE - ) - if self.mode not in support_mode: - print_info_log("File open not support %s mode" % self.mode) - raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) - check_link(self.file_path) - self.file_path = os.path.realpath(self.file_path) - check_path_length(self.file_path) - self.check_ability_and_owner() - check_path_pattern_vaild(self.file_path) - if os.path.exists(self.file_path): - check_common_file_size(self.file_path) - - def check_ability_and_owner(self): - if self.mode in self.SUPPORT_READ_MODE: - check_path_exists(self.file_path) - check_path_readability(self.file_path) - check_path_owner_consistent(self.file_path) - if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): - check_path_writability(self.file_path) - check_path_owner_consistent(self.file_path) - if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): - check_path_readability(self.file_path) - check_path_writability(self.file_path) - check_path_owner_consistent(self.file_path) - - -def check_link(path): - abs_path = os.path.abspath(path) - if os.path.islink(abs_path): - print_info_log("The file path {} is a soft 
link.".format(path)) - raise FileCheckException(FileCheckException.SOFT_LINK_ERROR) - - -def check_path_length(path, name_length=None): - file_max_name_length = ( - name_length if name_length else FileCheckConst.FILE_NAME_LENGTH - ) - if ( - len(path) > FileCheckConst.DIRECTORY_LENGTH - or len(os.path.basename(path)) > file_max_name_length - ): - print_info_log("The file path length exceeds limit.") - raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) - - -def check_path_exists(path): - if not os.path.exists(path): - print_info_log("The file path %s does not exist." % path) - raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) - - -def check_path_readability(path): - if not os.access(path, os.R_OK): - print_info_log("The file path %s is not readable." % path) - raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) - - -def check_path_writability(path): - if not os.access(path, os.W_OK): - print_info_log("The file path %s is not writable." % path) - raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) - - -def check_path_executable(path): - if not os.access(path, os.X_OK): - print_info_log("The file path %s is not executable." % path) - raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) - - -def check_other_user_writable(path): - st = os.stat(path) - if st.st_mode & 0o002: - print_info_log( - "The file path %s may be insecure because other users have write permissions. " - % path - ) - raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) - - -def check_path_owner_consistent(path): - file_owner = os.stat(path).st_uid - if file_owner != os.getuid(): - print_info_log( - "The file path %s may be insecure because is does not belong to you." % path - ) - raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) - - -def check_path_pattern_vaild(path): - if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): - print_info_log("The file path %s contains special characters." 
% (path)) - raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) - - -def check_file_size(file_path, max_size): - file_size = os.path.getsize(file_path) - if file_size >= max_size: - print_info_log(f"The size of file path {file_path} exceeds {max_size} bytes.") - raise FileCheckException(FileCheckException.FILE_TOO_LARGE_ERROR) - - -def check_common_file_size(file_path): - if os.path.isfile(file_path): - for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): - if file_path.endswith(suffix): - check_file_size(file_path, max_size) - break - - -def check_file_suffix(file_path, file_suffix): - if file_suffix: - if not file_path.endswith(file_suffix): - print_info_log(f"The {file_path} should be a {file_suffix} file!") - raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) - - -def check_path_type(file_path, file_type): - if file_type == FileCheckConst.FILE: - if not os.path.isfile(file_path): - print_info_log(f"The {file_path} should be a file!") - raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) - if file_type == FileCheckConst.DIR: - if not os.path.isdir(file_path): - print_info_log(f"The {file_path} should be a dictionary!") - raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) - - -def create_directory(dir_path): - """ - Function Description: - creating a directory with specified permissions - Parameter: - dir_path: directory path - Exception Description: - when invalid data throw exception - """ - dir_path = os.path.realpath(dir_path) - try: - os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) - except OSError as ex: - raise FileCheckException( - FileCheckException.ILLEGAL_PATH_ERROR, - "Failed to create {}. Please check the path permission or disk space .{}".format( - dir_path, str(ex) - ), - ) from ex - - -def check_path_before_create(path): - if path_len_exceeds_limit(path): - raise FileCheckException( - FileCheckException.ILLEGAL_PATH_ERROR, "The file path length exceeds limit." 
- ) - - if not re.match(FileCheckConst.FILE_PATTERN, os.path.realpath(path)): - raise FileCheckException( - FileCheckException.ILLEGAL_PATH_ERROR, - "The file path {} contains special characters.".format(path), - ) - - -def change_mode(path, mode): - if not os.path.exists(path) or os.path.islink(path): - return - try: - os.chmod(path, mode) - except PermissionError as ex: - raise FileCheckException( - FileCheckException.FILE_PERMISSION_ERROR, - "Failed to change {} authority. {}".format(path, str(ex)), - ) from ex - - -def path_len_exceeds_limit(file_path): - return ( - len(os.path.realpath(file_path)) > FileCheckConst.DIRECTORY_LENGTH - or len(os.path.basename(file_path)) > FileCheckConst.FILE_NAME_LENGTH - ) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 1e91c5a0a93..9e11a337a06 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -399,6 +399,7 @@ class TrainerMon: context.metric_list.clear() context.step += 1 self.anomaly_data_writer.write_detected_json(self.summary_writer.get_anomalies()) + self.summary_writer.clear_anomalies() self.call_id = 0 return if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): @@ -521,7 +522,7 @@ class TrainerMon: def register_hooks(model_chunk, vpp_stage=None): for param_name, param in model_chunk.named_parameters(): - prefix = "" if vpp_stage is None else f"{vpp_stage}_" + prefix = "" if not self.vpp else f"vpp{vpp_stage}_" name = prefix + param_name for target in self.config['targets'].keys(): context = self.grad_context[name] @@ -532,21 +533,19 @@ class TrainerMon: param.register_hook(partial(param_hook, context=context)) context.grad_acc = torch.zeros_like(param).to(DEVICE) + model = [model] if not isinstance(model, list) else model + if len(model) > 1: + self.vpp = True + self._smallest_rank_print('vpp enabled') if self.print_struct: - 
self.module_struct = { - module_name: 1. for module_name, module in model.named_modules()} + for vpp_stage, model_chunk in enumerate(model): + prefix = "" if not self.vpp else f"vpp{vpp_stage}_" + self.module_struct = { + prefix + f"{module_name}": {} for module_name, _ in model_chunk.named_modules()} return - - if isinstance(model, list): - if len(model) > 1: - self.vpp = True - self._smallest_rank_print('vpp enabled') - for index, model_chunk in enumerate(model): - vpp_stage = index if self.vpp else 0 - register_hooks(model_chunk, vpp_stage=vpp_stage) - - else: - register_hooks(model) - \ No newline at end of file + for index, model_chunk in enumerate(model): + vpp_stage = index if self.vpp else 0 + register_hooks(model_chunk, vpp_stage=vpp_stage) + \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 9415fdafcfe..3424e4cc7c6 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -8,7 +8,7 @@ def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): if rank is None: return f"{module_or_param_name}/{tag}" else: - return f"{module_or_param_name}/{rank}/{tag}" + return f"{module_or_param_name}/rank{rank}/{tag}" # 用于存储所有metric实现类的注册表 -- Gitee From 7e51f29b345d2623b214970ffa79b2606af93c29 Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 6 Aug 2024 09:34:05 +0000 Subject: [PATCH 28/94] all actv grad in one context --- .../accuracy_tools/kj600/kj600/module_hook.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 370cffc2b0c..3c1b5492cc7 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -6,6 +6,7 @@ from functools import partial from datetime import datetime import torch import 
torch.distributed as dist +from torch import Stream from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook from kj600.module_spec_verifier import get_config, validate_config_spec from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon @@ -85,6 +86,7 @@ class GradContext: self.post = [] self.acc_metric = [] self.acc = {} + self.actv = [] def reset(self): self.pre.clear() @@ -182,6 +184,7 @@ class TrainerMon: self.optimizer_hooked = False self.param2name = defaultdict(str) + self.metric_stream = torch.cuda.Stream() self.mix_precision_optimizer_mon = OptimizerMonFactory.create_optimizer_mon(opt_ty) if opt_ty is None: @@ -298,20 +301,23 @@ class TrainerMon: if not self.xy_distribution: return for _, fwd_context in self.module_fwd_hook_context_by_module.items(): - if len(bwd_context.actv) == 0: + if len(fwd_context.actv) == 0: continue if not len(fwd_context.actv) == self.micro_batch_number: print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv') fwd_context.actv.clear() - for _, bwd_context in self.module_bwd_hook_context_by_module.items(): - if len(bwd_context.actvgrad) == 0: - continue - if not len(bwd_context.actvgrad) == self.micro_batch_number: - print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") - self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') - bwd_context.actvgrad.clear() + # for _, bwd_context in self.module_bwd_hook_context_by_module.items(): + # if len(bwd_context.actvgrad) == 0: + # continue + # if not len(bwd_context.actvgrad) == self.micro_batch_number: + # print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, 
{self.micro_batch_number}") + # self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') + # bwd_context.actvgrad.clear() + + self.write_metrics(self.ops, self.summary_writer, self.grad_context.actv, step, 'grad_actv') + self.grad_context.actv.clear() def write_grad_tb(self, step): if not self.wg_distribution: @@ -525,8 +531,13 @@ class TrainerMon: if context.micro_step == 0 and context.actvgrad: print_warn_log(f"actvgrad context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. Now clear it.") context.actvgrad.clear() - context.actvgrad.append(metric_dict) + # context.actvgrad.append(metric_dict) + if len(self.grad_context.actv) == context.micro_step: + self.grad_context.actv.append({metric_name:{} for metric_name in self.ops}) + for metric_name in self.ops: + self.grad_context.actv[context.micro_step][metric_name].update(metric_dict[metric_name]) + context.micro_step += 1 if context.micro_step == self.micro_batch_number: context.micro_step = 0 -- Gitee From 94613ee7c34e49314243465b208358c0ae151a4e Mon Sep 17 00:00:00 2001 From: louyujing <7927276+louyujing@user.noreply.gitee.com> Date: Tue, 6 Aug 2024 12:00:34 +0000 Subject: [PATCH 29/94] =?UTF-8?q?!1393=20=E5=90=8C=E6=AD=A5master=E7=9A=84?= =?UTF-8?q?msprobe=E5=88=B0poc=20Merge=20pull=20request=20!1393=20from=20l?= =?UTF-8?q?ouyujing/poc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../accuracy_tools/atat/mindspore/__init__.py | 1 - .../pytorch_ut/compare/test_acc_compare.py | 17 - .../atat/test/pytorch_ut/test_pt_config.py | 38 - .../{atat => msprobe}/README.md | 94 +- .../{atat => msprobe}/__init__.py | 0 .../{atat => msprobe}/config/README.md | 62 +- .../{atat => msprobe}/config/config.json | 5 + .../config/img/free_benchmark.png | Bin .../{atat => msprobe}/core/common/const.py | 36 +- .../core/common/exceptions.py | 37 +- .../core/common/file_check.py | 6 +- .../{atat => 
msprobe}/core/common/log.py | 0 .../{atat => msprobe}/core/common/utils.py | 17 +- .../{atat => msprobe}/core/common_config.py | 47 +- .../core/data_dump/data_collector.py | 37 +- .../core/data_dump/data_processor/base.py | 80 +- .../core/data_dump/data_processor/factory.py | 14 +- .../data_processor/mindspore_processor.py | 206 ++++ .../data_processor/pytorch_processor.py | 96 +- .../core/data_dump/json_writer.py | 26 +- .../{atat => msprobe}/core/data_dump/scope.py | 4 +- .../msprobe/mindspore/__init__.py | 1 + .../msprobe/mindspore/common/log.py | 38 + .../msprobe/mindspore/common/utils.py | 44 + .../mindspore/debugger/__init__.py | 0 .../mindspore/debugger/debugger_config.py | 21 +- .../mindspore/debugger/precision_debugger.py | 55 +- .../{atat => msprobe}/mindspore/doc/dump.md | 8 +- .../mindspore/dump/__init__.py | 0 .../mindspore/dump/api_kbk_dump.py | 8 +- .../mindspore/dump/dump_tool_factory.py | 6 +- .../mindspore/dump/hook_cell/api_registry.py | 104 ++ .../mindspore/dump/hook_cell/hook_cell.py | 53 + .../dump/hook_cell/support_wrap_ops.yaml | 925 ++++++++++++++++++ .../dump/hook_cell/wrap_functional.py | 94 ++ .../mindspore/dump/hook_cell/wrap_tensor.py | 66 ++ .../mindspore/dump/kernel_graph_dump.py | 8 +- .../{atat => msprobe}/mindspore/ms_config.py | 39 +- .../mindspore/overflow_check/__init__.py | 0 .../kernel_graph_overflow_check.py | 8 +- .../overflow_check_tool_factory.py | 4 +- .../msprobe/mindspore/service.py | 152 +++ .../mindspore/task_handler_factory.py | 6 +- .../{atat/atat.py => msprobe/msprobe.py} | 12 +- .../{atat => msprobe}/pytorch/__init__.py | 0 .../pytorch/advisor/advisor.py | 12 +- .../pytorch/advisor/advisor_const.py | 0 .../pytorch/advisor/advisor_result.py | 8 +- .../pytorch/api_accuracy_checker/.keep | 0 .../pytorch/api_accuracy_checker/__init__.py | 0 .../pytorch/api_accuracy_checker/common/.keep | 0 .../api_accuracy_checker/common/__init__.py | 0 .../api_accuracy_checker/common/config.py | 37 +- 
.../api_accuracy_checker/common/utils.py | 9 +- .../api_accuracy_checker/compare/__init__.py | 0 .../api_accuracy_checker/compare/algorithm.py | 4 +- .../compare/api_precision_compare.py | 18 +- .../compare/api_precision_standard.yaml | 0 .../compare/api_precision_threshold.yaml | 0 .../api_accuracy_checker/compare/compare.py | 14 +- .../compare/compare_column.py | 2 +- .../compare/compare_utils.py | 8 +- .../pytorch/api_accuracy_checker/config.yaml | 1 + .../pytorch/api_accuracy_checker/run_ut/.keep | 0 .../api_accuracy_checker/run_ut/__init__.py | 0 .../run_ut/data_generate.py | 15 +- .../run_ut/multi_run_ut.py | 31 +- .../run_ut/run_overflow_check.py | 24 +- .../api_accuracy_checker/run_ut/run_ut.py | 97 +- .../run_ut/run_ut_utils.py | 0 .../run_ut/torch_ut_setting.json | 0 .../pytorch/bench_functions/__init__.py | 15 + .../pytorch/bench_functions/apply_adam_w.py | 28 + .../bench_functions/confusion_transpose.py | 19 + .../pytorch/bench_functions/fast_gelu.py | 55 ++ .../bench_functions/layer_norm_eval.py | 6 + .../msprobe/pytorch/bench_functions/linear.py | 12 + .../bench_functions/matmul_backward.py | 48 + .../bench_functions/npu_fusion_attention.py | 421 ++++++++ .../pytorch/bench_functions/rms_norm.py | 15 + .../pytorch/bench_functions/rotary_mul.py | 52 + .../bench_functions/scaled_mask_softmax.py | 26 + .../msprobe/pytorch/bench_functions/swiglu.py | 55 ++ .../pytorch/common/__init__.py | 0 .../pytorch/common/compare_script.template | 0 .../{atat => msprobe}/pytorch/common/log.py | 6 +- .../pytorch/common/parse_json.py | 6 +- .../{atat => msprobe}/pytorch/common/utils.py | 38 +- .../pytorch/compare/acc_compare.py | 67 +- .../pytorch/compare/distributed_compare.py | 24 +- .../pytorch/compare/highlight.py | 4 +- .../pytorch/compare/mapping.yaml | 0 .../pytorch/compare/match.py | 4 +- .../pytorch/compare/npy_compare.py | 6 +- .../pytorch/debugger/__init__.py | 0 .../pytorch/debugger/debugger_config.py | 13 +- .../pytorch/debugger/precision_debugger.py | 37 +- 
.../{atat => msprobe}/pytorch/doc/FAQ.md | 30 +- .../pytorch/doc/api_accuracy_checker.md | 80 +- .../{atat => msprobe}/pytorch/doc/dump.md | 97 +- .../pytorch/doc/img/BLOOM-7B_1.png | Bin .../pytorch/doc/img/BLOOM-7B_2.png | Bin .../pytorch/doc/img/BLOOM-7B_3.png | Bin .../pytorch/doc/img/BLOOM-7B_4.png | Bin .../pytorch/doc/img/GPT-3_1.png | Bin .../pytorch/doc/img/GPT-3_2.png | Bin .../pytorch/doc/img/GPT-3_3.png | Bin .../pytorch/doc/img/GPT-3_4.png | Bin .../pytorch/doc/img/GPT-3_5.png | Bin .../pytorch/doc/img/GPT-3_6.png | Bin .../pytorch/doc/img/GPT-3_7.png | Bin .../pytorch/doc/img/GPT-3_8.png | Bin .../pytorch/doc/img/YOLOV5S_1.png | Bin .../pytorch/doc/img/YOLOV5S_2.png | Bin .../doc/img/accuracy_checking_details.png | Bin .../doc/img/accuracy_checking_result.png | Bin .../doc/img/api_precision_compare_details.png | Bin .../doc/img/api_precision_compare_result.png | Bin .../pytorch/doc/img/auto_analyze_log.png | Bin .../pytorch/doc/img/compare_result_pkl.png | Bin .../doc/img/compare_result_pkl_md5.png.png | Bin .../pytorch/doc/img/cpu_info.png | Bin .../pytorch/doc/img/module_compare.png | Bin ...72\347\272\277\346\212\245\345\221\212.md" | 4 +- .../pytorch/doc/parse_tool.md | 6 +- .../pytorch/doc/ptdbg_ascend_compare.md | 6 +- .../pytorch/doc/ptdbg_ascend_overview.md | 10 +- .../pytorch/doc/ptdbg_ascend_quickstart.md | 24 +- .../pytorch/doc/run_overflow_check.md | 2 +- ...76\345\272\246\346\257\224\345\257\271.md" | 8 +- .../pytorch/free_benchmark/__init__.py | 6 +- .../pytorch/free_benchmark/common/__init__.py | 0 .../pytorch/free_benchmark/common/constant.py | 7 +- .../pytorch/free_benchmark/common/counter.py | 2 +- .../pytorch/free_benchmark/common/enums.py | 0 .../pytorch/free_benchmark/common/params.py | 8 +- .../pytorch/free_benchmark/common/utils.py | 6 +- .../free_benchmark/compare/grad_saver.py | 20 +- .../compare/single_benchmark.py | 22 +- .../pytorch/free_benchmark/main.py | 20 +- .../perturbed_layers/__init__.py | 0 
.../perturbed_layers/base_layer.py | 2 +- .../perturbed_layers/layer_factory.py | 16 +- .../perturbed_layers/npu/__init__.py | 0 .../perturbed_layers/npu/add_noise.py | 22 +- .../perturbed_layers/npu/bit_noise.py | 22 +- .../perturbed_layers/npu/change_value.py | 14 +- .../perturbed_layers/npu/improve_precision.py | 14 +- .../perturbed_layers/npu/no_change.py | 10 +- .../perturbed_layers/npu/npu_base_layser.py | 4 +- .../perturbed_layers/run_cpu.py | 12 +- .../result_handlers/__init__.py | 0 .../result_handlers/base_handler.py | 90 +- .../result_handlers/check_handler.py | 14 +- .../result_handlers/fix_handler.py | 10 +- .../result_handlers/handler_factory.py | 14 +- .../result_handlers/preheat_handler.py | 22 +- .../msprobe/pytorch/function_factory.py | 75 ++ .../pytorch/functional/__init__.py | 0 .../pytorch/functional/data_processor.py | 0 .../pytorch/functional/dump_module.py | 16 +- .../pytorch/hook_module/__init__.py | 0 .../pytorch/hook_module/api_registry.py | 18 +- .../pytorch/hook_module/hook_module.py | 8 +- .../pytorch/hook_module/support_wrap_ops.yaml | 3 +- .../pytorch/hook_module/utils.py | 2 +- .../pytorch/hook_module/wrap_aten.py | 35 +- .../pytorch/hook_module/wrap_distributed.py | 8 +- .../pytorch/hook_module/wrap_functional.py | 10 +- .../pytorch/hook_module/wrap_npu_custom.py | 28 +- .../pytorch/hook_module/wrap_tensor.py | 8 +- .../pytorch/hook_module/wrap_torch.py | 8 +- .../pytorch/hook_module/wrap_vf.py | 8 +- .../pytorch/module_processer.py | 43 +- .../pytorch/online_dispatch/__init__.py | 0 .../pytorch/online_dispatch/compare.py | 10 +- .../pytorch/online_dispatch/dispatch.py | 10 +- .../pytorch/online_dispatch/dump_compare.py | 6 +- .../pytorch/online_dispatch/single_compare.py | 0 .../online_dispatch/torch_ops_config.yaml | 0 .../pytorch/online_dispatch/utils.py | 4 +- .../{atat => msprobe}/pytorch/parse.py | 2 +- .../pytorch/parse_tool/__init__.py | 0 .../pytorch/parse_tool/cli.py | 4 +- .../pytorch/parse_tool/lib/__init__.py | 0 
.../pytorch/parse_tool/lib/compare.py | 33 +- .../pytorch/parse_tool/lib/config.py | 2 +- .../pytorch/parse_tool/lib/file_desc.py | 0 .../pytorch/parse_tool/lib/interactive_cli.py | 8 +- .../pytorch/parse_tool/lib/parse_exception.py | 2 +- .../pytorch/parse_tool/lib/parse_tool.py | 10 +- .../pytorch/parse_tool/lib/utils.py | 42 +- .../pytorch/parse_tool/lib/visualization.py | 8 +- .../{atat => msprobe}/pytorch/pt_config.py | 53 +- .../{atat => msprobe}/pytorch/service.py | 44 +- .../test/core_ut/common}/test_utils.py | 40 +- .../core_ut/data_dump/test_data_collector.py | 47 + .../core_ut/data_dump/test_json_writer.py | 6 +- .../test/core_ut/data_dump/test_scope.py | 151 +++ .../test/core_ut/test_common_config.py | 30 +- .../test/core_ut/test_file_check.py | 42 +- .../test/core_ut/test_log.py | 10 +- .../test/mindspore_ut/test_api_kbk_dump.py | 14 +- .../test/mindspore_ut/test_debugger_config.py | 8 +- .../mindspore_ut/test_dump_tool_factory.py | 6 +- .../mindspore_ut/test_kernel_graph_dump.py | 14 +- .../test_kernel_graph_overflow_check.py | 14 +- .../test/mindspore_ut/test_ms_config.py | 12 +- .../test_overflow_check_tool_factory.py | 6 +- .../mindspore_ut/test_precision_debugger.py | 12 +- .../mindspore_ut/test_task_handler_factory.py | 10 +- .../test/pytorch_ut/advisor/test_advisor.py | 15 +- .../common/test_common_utils.py | 4 +- .../common/test_config.py | 4 +- .../compare/test_algorithm.py | 2 +- .../compare/test_api_precision_compare.py | 4 +- .../compare/test_compare.py | 6 +- .../compare/test_compare_column.py | 2 +- .../compare/test_compare_utils.py | 4 +- .../api_accuracy_checker/run_ut/dump.json | 0 .../api_accuracy_checker/run_ut/forward.json | 0 .../run_ut/test_data_generate.py | 4 +- .../run_ut/test_multi_run_ut.py | 20 +- .../run_ut/test_run_ut.py | 4 +- .../pytorch_ut/compare/test_acc_compare.py | 267 +++++ .../test/pytorch_ut/compare/test_match.py | 20 + .../perturbed_layers/test_perturbed_layser.py | 8 +- 
.../result_handlers/test_result_handler.py | 12 +- .../pytorch_ut/free_benchmark/test_main.py | 8 +- .../pytorch_ut/functional/test_dump_module.py | 15 + .../hook_module/test_api_registry.py | 6 +- .../hook_module/test_hook_module.py | 2 +- .../pytorch_ut/hook_module/test_wrap_aten.py | 2 +- .../hook_module/test_wrap_distributed.py | 2 +- .../hook_module/test_wrap_functional.py | 2 +- .../hook_module/test_wrap_tensor.py | 2 +- .../pytorch_ut/hook_module/test_wrap_torch.py | 2 +- .../pytorch_ut/hook_module/test_wrap_vf.py | 2 +- .../msprobe/test/pytorch_ut/test_pt_config.py | 84 ++ .../msprobe/test/pytorch_ut/test_service.py | 59 ++ .../test/resources/advisor.txt | 0 .../compare_result_20230703104808.csv | 0 .../compare_result_without_accuracy.csv | 0 .../test/resources/config.yaml | 0 .../test/resources/npu_test.pkl | 0 .../{atat => msprobe}/test/run_test.sh | 0 .../{atat => msprobe}/test/run_ut.py | 2 +- .../test/test_module_processer.py | 4 +- 248 files changed, 4732 insertions(+), 1102 deletions(-) delete mode 100644 debug/accuracy_tools/atat/mindspore/__init__.py delete mode 100644 debug/accuracy_tools/atat/test/pytorch_ut/compare/test_acc_compare.py delete mode 100644 debug/accuracy_tools/atat/test/pytorch_ut/test_pt_config.py rename debug/accuracy_tools/{atat => msprobe}/README.md (64%) rename debug/accuracy_tools/{atat => msprobe}/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/config/README.md (89%) rename debug/accuracy_tools/{atat => msprobe}/config/config.json (84%) rename debug/accuracy_tools/{atat => msprobe}/config/img/free_benchmark.png (100%) rename debug/accuracy_tools/{atat => msprobe}/core/common/const.py (91%) rename debug/accuracy_tools/{atat => msprobe}/core/common/exceptions.py (52%) rename debug/accuracy_tools/{atat => msprobe}/core/common/file_check.py (98%) rename debug/accuracy_tools/{atat => msprobe}/core/common/log.py (100%) rename debug/accuracy_tools/{atat => msprobe}/core/common/utils.py (97%) rename 
debug/accuracy_tools/{atat => msprobe}/core/common_config.py (54%) rename debug/accuracy_tools/{atat => msprobe}/core/data_dump/data_collector.py (85%) rename debug/accuracy_tools/{atat => msprobe}/core/data_dump/data_processor/base.py (87%) rename debug/accuracy_tools/{atat => msprobe}/core/data_dump/data_processor/factory.py (89%) create mode 100644 debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py rename debug/accuracy_tools/{atat => msprobe}/core/data_dump/data_processor/pytorch_processor.py (87%) rename debug/accuracy_tools/{atat => msprobe}/core/data_dump/json_writer.py (85%) rename debug/accuracy_tools/{atat => msprobe}/core/data_dump/scope.py (98%) create mode 100644 debug/accuracy_tools/msprobe/mindspore/__init__.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/common/log.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/common/utils.py rename debug/accuracy_tools/{atat => msprobe}/mindspore/debugger/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/debugger/debugger_config.py (69%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/debugger/precision_debugger.py (31%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/doc/dump.md (81%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/dump/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/dump/api_kbk_dump.py (91%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/dump/dump_tool_factory.py (82%) create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/api_registry.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/hook_cell.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/wrap_functional.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/wrap_tensor.py rename debug/accuracy_tools/{atat => 
msprobe}/mindspore/dump/kernel_graph_dump.py (92%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/ms_config.py (67%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/overflow_check/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/overflow_check/kernel_graph_overflow_check.py (89%) rename debug/accuracy_tools/{atat => msprobe}/mindspore/overflow_check/overflow_check_tool_factory.py (81%) create mode 100644 debug/accuracy_tools/msprobe/mindspore/service.py rename debug/accuracy_tools/{atat => msprobe}/mindspore/task_handler_factory.py (68%) rename debug/accuracy_tools/{atat/atat.py => msprobe/msprobe.py} (82%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/advisor/advisor.py (93%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/advisor/advisor_const.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/advisor/advisor_result.py (90%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/.keep (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/common/.keep (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/common/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/common/config.py (58%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/common/utils.py (96%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/algorithm.py (98%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/api_precision_compare.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml (100%) rename 
debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/compare.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/compare_column.py (98%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/compare/compare_utils.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/config.yaml (77%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/.keep (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/data_generate.py (95%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py (86%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py (85%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/run_ut.py (85%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json (100%) create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/__init__.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/apply_adam_w.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/confusion_transpose.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/fast_gelu.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/layer_norm_eval.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/linear.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/matmul_backward.py create mode 100644 
debug/accuracy_tools/msprobe/pytorch/bench_functions/npu_fusion_attention.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/rms_norm.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/rotary_mul.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/scaled_mask_softmax.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/bench_functions/swiglu.py rename debug/accuracy_tools/{atat => msprobe}/pytorch/common/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/common/compare_script.template (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/common/log.py (81%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/common/parse_json.py (89%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/common/utils.py (87%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/compare/acc_compare.py (95%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/compare/distributed_compare.py (85%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/compare/highlight.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/compare/mapping.yaml (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/compare/match.py (91%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/compare/npy_compare.py (98%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/debugger/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/debugger/debugger_config.py (89%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/debugger/precision_debugger.py (77%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/FAQ.md (72%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/api_accuracy_checker.md (84%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/dump.md (66%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/BLOOM-7B_1.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/BLOOM-7B_2.png (100%) rename 
debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/BLOOM-7B_3.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/BLOOM-7B_4.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_1.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_2.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_3.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_4.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_5.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_6.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_7.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/GPT-3_8.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/YOLOV5S_1.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/YOLOV5S_2.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/accuracy_checking_details.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/accuracy_checking_result.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/api_precision_compare_details.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/api_precision_compare_result.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/auto_analyze_log.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/compare_result_pkl.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/compare_result_pkl_md5.png.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/cpu_info.png (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/img/module_compare.png (100%) rename 
"debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" => "debug/accuracy_tools/msprobe/pytorch/doc/msprobe\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/parse_tool.md (98%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/ptdbg_ascend_compare.md (99%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/ptdbg_ascend_overview.md (81%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/ptdbg_ascend_quickstart.md (94%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/doc/run_overflow_check.md (95%) rename "debug/accuracy_tools/atat/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" => "debug/accuracy_tools/msprobe/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" (95%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/__init__.py (43%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/common/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/common/constant.py (89%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/common/counter.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/common/enums.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/common/params.py (93%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/common/utils.py (92%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/compare/grad_saver.py (89%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/compare/single_benchmark.py (89%) rename 
debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/main.py (81%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/base_layer.py (78%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/layer_factory.py (62%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/npu/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py (78%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py (80%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/npu/change_value.py (81%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py (83%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/npu/no_change.py (64%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py (90%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/perturbed_layers/run_cpu.py (52%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/result_handlers/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/result_handlers/base_handler.py (64%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/result_handlers/check_handler.py (68%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/result_handlers/fix_handler.py (60%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/result_handlers/handler_factory.py (59%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/free_benchmark/result_handlers/preheat_handler.py (88%) create mode 100644 debug/accuracy_tools/msprobe/pytorch/function_factory.py 
rename debug/accuracy_tools/{atat => msprobe}/pytorch/functional/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/functional/data_processor.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/functional/dump_module.py (73%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/api_registry.py (91%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/hook_module.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/support_wrap_ops.yaml (99%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/utils.py (95%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/wrap_aten.py (71%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/wrap_distributed.py (91%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/wrap_functional.py (94%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/wrap_npu_custom.py (71%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/wrap_tensor.py (89%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/wrap_torch.py (91%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/hook_module/wrap_vf.py (88%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/module_processer.py (72%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/online_dispatch/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/online_dispatch/compare.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/online_dispatch/dispatch.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/online_dispatch/dump_compare.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/online_dispatch/single_compare.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/online_dispatch/torch_ops_config.yaml (100%) rename debug/accuracy_tools/{atat => 
msprobe}/pytorch/online_dispatch/utils.py (97%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse.py (50%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/cli.py (89%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/__init__.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/compare.py (92%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/config.py (98%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/file_desc.py (100%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/interactive_cli.py (93%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/parse_exception.py (96%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/parse_tool.py (95%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/utils.py (94%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/parse_tool/lib/visualization.py (94%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/pt_config.py (58%) rename debug/accuracy_tools/{atat => msprobe}/pytorch/service.py (82%) rename debug/accuracy_tools/{atat/test/core_ut => msprobe/test/core_ut/common}/test_utils.py (90%) create mode 100644 debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_data_collector.py rename debug/accuracy_tools/{atat => msprobe}/test/core_ut/data_dump/test_json_writer.py (97%) create mode 100644 debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_scope.py rename debug/accuracy_tools/{atat => msprobe}/test/core_ut/test_common_config.py (83%) rename debug/accuracy_tools/{atat => msprobe}/test/core_ut/test_file_check.py (85%) rename debug/accuracy_tools/{atat => msprobe}/test/core_ut/test_log.py (92%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_api_kbk_dump.py (75%) rename debug/accuracy_tools/{atat => 
msprobe}/test/mindspore_ut/test_debugger_config.py (87%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_dump_tool_factory.py (89%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_kernel_graph_dump.py (80%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_kernel_graph_overflow_check.py (76%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_ms_config.py (83%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_overflow_check_tool_factory.py (88%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_precision_debugger.py (79%) rename debug/accuracy_tools/{atat => msprobe}/test/mindspore_ut/test_task_handler_factory.py (82%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/advisor/test_advisor.py (85%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py (96%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/common/test_config.py (92%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py (98%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py (94%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py (96%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py (68%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py (88%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json (100%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json (100%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py (96%) 
rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py (83%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py (95%) create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_acc_compare.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_match.py rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py (94%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py (91%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/free_benchmark/test_main.py (92%) create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/functional/test_dump_module.py rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_api_registry.py (91%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_hook_module.py (94%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_wrap_aten.py (96%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_wrap_distributed.py (95%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_wrap_functional.py (91%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_wrap_tensor.py (88%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_wrap_torch.py (96%) rename debug/accuracy_tools/{atat => msprobe}/test/pytorch_ut/hook_module/test_wrap_vf.py (82%) create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/test_pt_config.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/test_service.py rename debug/accuracy_tools/{atat => msprobe}/test/resources/advisor.txt (100%) rename debug/accuracy_tools/{atat => msprobe}/test/resources/compare_result_20230703104808.csv (100%) rename 
debug/accuracy_tools/{atat => msprobe}/test/resources/compare_result_without_accuracy.csv (100%) rename debug/accuracy_tools/{atat => msprobe}/test/resources/config.yaml (100%) rename debug/accuracy_tools/{atat => msprobe}/test/resources/npu_test.pkl (100%) rename debug/accuracy_tools/{atat => msprobe}/test/run_test.sh (100%) rename debug/accuracy_tools/{atat => msprobe}/test/run_ut.py (97%) rename debug/accuracy_tools/{atat => msprobe}/test/test_module_processer.py (95%) diff --git a/debug/accuracy_tools/atat/mindspore/__init__.py b/debug/accuracy_tools/atat/mindspore/__init__.py deleted file mode 100644 index bb3f9356754..00000000000 --- a/debug/accuracy_tools/atat/mindspore/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from atat.mindspore.debugger.precision_debugger import PrecisionDebugger diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/compare/test_acc_compare.py b/debug/accuracy_tools/atat/test/pytorch_ut/compare/test_acc_compare.py deleted file mode 100644 index 5a82289a000..00000000000 --- a/debug/accuracy_tools/atat/test/pytorch_ut/compare/test_acc_compare.py +++ /dev/null @@ -1,17 +0,0 @@ -# coding=utf-8 -import unittest -from atat.pytorch.compare.acc_compare import rename_api - -class TestUtilsMethods(unittest.TestCase): - - def test_rename_api(self): - test_name_1 = "Distributed.broadcast.0.forward.input.0" - expect_name_1 = "Distributed.broadcast.input.0" - actual_name_1 = rename_api(test_name_1, "forward") - self.assertEqual(actual_name_1, expect_name_1) - - test_name_2 = "Torch.sum.0.backward.output.0" - expect_name_2 = "Torch.sum.output.0" - actual_name_2 = rename_api(test_name_2, "backward") - self.assertEqual(actual_name_2, expect_name_2) - \ No newline at end of file diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/test_pt_config.py b/debug/accuracy_tools/atat/test/pytorch_ut/test_pt_config.py deleted file mode 100644 index fa52fe0e1b0..00000000000 --- a/debug/accuracy_tools/atat/test/pytorch_ut/test_pt_config.py +++ /dev/null @@ -1,38 
+0,0 @@ -from unittest import TestCase -from unittest.mock import patch, mock_open - -from atat.core.common.const import Const -from atat.pytorch.pt_config import parse_json_config - - -class TestPtConfig(TestCase): - def test_parse_json_config(self): - mock_json_data = { - "task": "statistics", - "dump_path": "./dump/", - "rank": [], - "step": [], - "level": "L1", - "seed": 1234, - "statistics": { - "scope": [], - "list": [], - "data_mode": ["all"], - }, - "tensor": { - "file_format": "npy" - } - } - with patch("atat.pytorch.pt_config.os.path.join", return_value="/path/config.json"), \ - patch("atat.pytorch.pt_config.FileOpen", mock_open(read_data='')), \ - patch("atat.pytorch.pt_config.json.load", return_value=mock_json_data): - common_config, task_config = parse_json_config(None, None) - self.assertEqual(common_config.task, Const.STATISTICS) - self.assertEqual(task_config.data_mode, ["all"]) - - with patch("atat.pytorch.pt_config.os.path.join", return_value="/path/config.json"), \ - patch("atat.pytorch.pt_config.FileOpen", mock_open(read_data='')), \ - patch("atat.pytorch.pt_config.json.load", return_value=mock_json_data): - common_config, task_config = parse_json_config(None, Const.TENSOR) - self.assertEqual(common_config.task, Const.STATISTICS) - self.assertEqual(task_config.file_format, "npy") diff --git a/debug/accuracy_tools/atat/README.md b/debug/accuracy_tools/msprobe/README.md similarity index 64% rename from debug/accuracy_tools/atat/README.md rename to debug/accuracy_tools/msprobe/README.md index de7d74f8f29..42743c50781 100644 --- a/debug/accuracy_tools/atat/README.md +++ b/debug/accuracy_tools/msprobe/README.md @@ -1,11 +1,26 @@ # MindStudio精度调试工具 -MindStudio精度调试工具(ascend_training_accuracy_tools),简称atat,是MindStudio Training Tools工具链下精度调试部分的工具包。主要包括精度预检和精度比对等子工具,当前适配场景包括PyTorch和MindSpore。 +MindStudio精度调试工具(MindStudio Probe),简称msprobe,是MindStudio Training Tools工具链下精度调试部分的工具包。主要包括精度预检和精度比对等子工具,当前适配场景包括PyTorch和MindSpore。 ## 工具安装 
-精度工具合一软件包名称:`ascend_training_accuracy_tools-{version}-py3-none-any.whl` +精度工具合一软件包名称:`mindstudio_probe-{version}-py3-none-any.whl` +### pip安装 + ```shell + pip install mindstudio-probe + ``` +使用`pip install mindstudio-probe==版本号`可安装指定版本的包。 + +pip命令会自动安装最新的包及其配套依赖。 + +提示如下信息则表示安装成功。 + +```bash +Successfully installed mindstudio_probe-{version} +``` + +### 下载whl包安装 1. 使用pip命令安装numpy、openpyxl、pandas、PyYAML、rich、torch、tqdm依赖。 若环境中已安装部分依赖,不需要重复安装。 @@ -16,6 +31,7 @@ MindStudio精度调试工具(ascend_training_accuracy_tools),简称atat, | 版本 | 发布日期 | 支持PyTorch版本 | 下载链接 | 校验码 | | ----- | ---------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 1.0.1 | 2024-07-25 | 2.0/2.1/2.2 | [mindstudio_probe-1.0.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.1-py3-none-any.whl) | b699e224e4d4e3bcf9412c54fa858a1ee370f0d7a2bc69cb3f1273ac14a6dc82 | | 1.0 | 2024-07-09 | 2.0/2.1/2.2 | [ascend_training_accuracy_tools-1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/1.0/ascend_training_accuracy_tools-1.0-py3-none-any.whl) | 5016dfe886c5d340ec6f60a959673355855f313c91f100680da814efb49f8e81 | | 0.0.3 | 2024-06-11 | 2.0/2.1/2.2 | [ascend_training_accuracy_tools-0.0.3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/0.0/ascend_training_accuracy_tools-0.0.3-py3-none-any.whl) | f46d9714704859e2d67861a65bbb3c76b0a250cf6e238b978b5b959ab1fe125a | | 0.0.2 | 2024-05-23 | 2.0/2.1/2.2 | [ascend_training_accuracy_tools-0.0.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/0.0/ascend_training_accuracy_tools-0.0.2-py3-none-any.whl) | 2e35809bde559e9c4d2f16a02ccde779ed9e436bb65fded0b7ebaf6ac2c88d93 | @@ -43,25 +59,79 @@ MindStudio精度调试工具(ascend_training_accuracy_tools),简称atat, 4. 
执行如下命令进行安装。 ```bash - pip3 install ./ascend_training_accuracy_tools-{version}-py3-none-any.whl + pip3 install ./mindstudio_probe-{version}-py3-none-any.whl ``` 若为覆盖安装,请在命令行末尾增加“--force-reinstall”参数强制安装,例如: ```bash - pip3 install ./ascend_training_accuracy_tools-{version}-py3-none-any.whl --force-reinstall + pip3 install ./mindstudio_probe-{version}-py3-none-any.whl --force-reinstall ``` 提示如下信息则表示安装成功。 ```bash - Successfully installed ascend_training_accuracy_tools-{version} + Successfully installed mindstudio_probe-{version} + ``` + +### 从源码安装 +1. 克隆或者下载项目源代码 + + ```shell + git clone https://gitee.com/ascend/mstt.git + cd debug/accuracy_tools + ``` + +2. 安装setuptools和wheel + + ```shell + pip install setuptools wheel + ``` + +3. 安装msprobe + + ```shell + python setup.py install ``` + 提示出现如下信息则表示源码安装成功。 + ```shell + Finished processing dependencies for mindstudio-probe=={version} + ``` + +### 查看msprobe工具信息 + +执行如下命令查看msprobe工具信息。 + +```bash +pip show mindstudio-probe +``` + +输出结果如下示例: + +```bash +Name: mindstudio-probe +Version: 1.0 +Summary: This is a pytorch precision comparison tools +Home-page: +Author: +Author-email: +License: +Location: /home/xx/anaconda3/envs/pt21py38/lib/python3.8/site-packages +Requires: numpy, openpyxl, pandas, pyyaml, rich, tqdm, wheel +Required-by: +``` + +关键字段含义: +- Name:工具名称。 +- Version:工具版本号。 +- Summary:工具概述。 +- Location:工具安装路径。 +- Requires:工具依赖。 ## 工具使用 -安装atat工具后,可以按照如下思路选择合适的子工具进行精度调试: +安装msprobe工具后,可以按照如下思路选择合适的子工具进行精度调试: 1. 
判断框架场景。 @@ -107,32 +177,32 @@ MindStudio精度调试工具(ascend_training_accuracy_tools),简称atat, MindSpore场景:暂不支持。 -上述流程中的工具均为atat工具的子工具,使用相同的命令行,格式如下: +上述流程中的工具均为msprobe工具的子工具,使用相同的命令行,格式如下: 精度预检工具 ```bash -atat -f run_ut [-h] +msprobe -f run_ut [-h] ``` ```bash -atat -f multi_run_ut [-h] +msprobe -f multi_run_ut [-h] ``` ```bash -atat -f api_precision_compare [-h] +msprobe -f api_precision_compare [-h] ``` 溢出解析工具 ```bash -atat -f run_overflow_check [-h] +msprobe -f run_overflow_check [-h] ``` 数据解析工具 ```bash -atat -f parse [-h] +msprobe -f parse [-h] ``` | 参数 | 说明 | diff --git a/debug/accuracy_tools/atat/__init__.py b/debug/accuracy_tools/msprobe/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/__init__.py rename to debug/accuracy_tools/msprobe/__init__.py diff --git a/debug/accuracy_tools/atat/config/README.md b/debug/accuracy_tools/msprobe/config/README.md similarity index 89% rename from debug/accuracy_tools/atat/config/README.md rename to debug/accuracy_tools/msprobe/config/README.md index a998704993a..7d11a365253 100644 --- a/debug/accuracy_tools/atat/config/README.md +++ b/debug/accuracy_tools/msprobe/config/README.md @@ -2,13 +2,38 @@ 当前配置文件主要为PrecisionDebugger接口执行dump或无标杆比对操作时调用的配置,当PrecisionDebugger接口未指定该配置文件时,使用该文件的默认配置。配置文件详见[config.json](./config.json)。 +当在环境上安装msprobe工具后,config.json文件位置可通过如下方式查找: + +查找msprobe工具安装路径。 + +``` +pip show mindstudio-probe +``` + +输出结果如下示例: + +``` +Name: mindstudio-probe +Version: 1.0 +Summary: This is a pytorch precision comparison tools +Home-page: +Author: +Author-email: +License: +Location: /home/xx/anaconda3/envs/pt21py38/lib/python3.8/site-packages +Requires: numpy, openpyxl, pandas, pyyaml, rich, tqdm, wheel +Required-by: +``` + +Location字段为msprobe工具的安装路径,那么config.json文件位置为/home/xx/anaconda3/envs/pt21py38/lib/python3.8/site-packages/msprobe/config + ## 参数说明 ### **通用配置参数** | 参数名 | 说明 | 是否必选 | | ----------------- | ------------------------------------------------------------ | -------- | -| task | 
dump的任务类型,str类型。可取值"free_benchmark"(无标杆比对,仅PyTorch场景支持)、"statistics"(仅dump API统计信息,默认值)、"tensor"(dump API统计信息和完全复刻整网的API运行情况的真实数据)、"overflow_check"(溢出检测)。配置示例:"task": "tensor"。根据task参数取值的不同,可以配置不同场景参数,详见:“**task配置为free_benchmark**”,“**task配置为statistics**”,“**task配置为tensor**”,“**task配置为overflow_check**”。 | 否 | +| task | dump的任务类型,str类型。可取值:
"free_benchmark"(无标杆比对,仅PyTorch场景支持)。
"statistics"(仅dump API统计信息,默认值)。
"tensor"(dump API统计信息和完全复刻整网的API运行情况的真实数据)。
"overflow_check"(溢出检测,仅PyTorch和MindSpore静态图场景支持)。
"run_ut"(精度预检配置,仅PyTorch场景支持)。
配置示例:"task": "tensor"。
根据task参数取值的不同,可以配置不同场景参数,详见:“**task配置为free_benchmark**”,“**task配置为statistics**”,“**task配置为tensor**”,“**task配置为overflow_check**”,“**task配置为run_ut**”。 | 否 | | dump_path | 设置dump数据目录路径,str类型。配置示例:"dump_path": "./dump_path"。MindSpore场景仅支持绝对路径。 | 是 | | rank | 指定对某张卡上的数据进行dump,list[int]类型,默认未配置(表示dump所有卡的数据),应配置为大于等于0的整数,且须配置实际可用的Rank ID。配置示例:"rank": [1]。
对于PyTorch场景,Rank ID从0开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的Rank ID,则dump数据为空,比如当前环境Rank ID为0到7,实际训练运行0到3卡,此时若配置Rank ID为4或不存在的10等其他值,此时dump数据为空。
对于MindSpore场景,所有节点的Rank ID均从0开始计数,最大取值为每个节点可用卡总数-1,config.json配置一次rank参数对所有节点同时生效。 | 否 | | step | 指定dump某个step的数据,list[int]类型。默认未配置,表示dump所有step数据。dump特定step时,须指定为训练脚本中存在的step。step为list格式,可配置逐个step,例如:"step": [0,1,2]。 | 否 | @@ -85,6 +110,18 @@ task配置为free_benchmark时,开启**无标杆比对**,在NPU环境下通 | overflow_nums | 控制溢出次数,int类型,仅PyTorch场景支持,表示第N次溢出时,停止训练,过程中检测到溢出API对应kernel数据均dump。配置示例:"overflow_nums": 3。默认为1,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。 | 否 | | check_mode | MindSpore场景kernel级别的溢出检测,str类型,可取值"aicore"(开启AI Core的溢出检测)、"atomic"(开启Atomic的溢出检测)、"all"(开启AI Core和Atomic的溢出检测,默认值)。配置示例"check_mode": "aicore"。 | 否 | +### task配置为run_ut + +仅PyTorch场景支持。 + +| 参数名称 | 说明 | 是否必选 | +| --------------- | ------------------------------------------------------------ | -------- | +| white_list | API dump白名单,仅对指定的API进行dump。配置示例:"white_list": ["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | +| black_list | API dump黑名单,被指定的API不进行dump。配置示例:"black_list": ["conv1d", "conv2d"]。默认未配置黑名单,即dump全量API数据。 | 否 | +| error_data_path | 配置保存精度未达标的API输入输出数据路径,默认为当前路径。配置示例"error_data_path": "./"。 | 否 | + +说明:white_list和black_list同时配置时,二者配置的API名单若无交集,则白名单生效,若API名单存在交集,则白名单排除的部分以及交集的API不进行dump。 + ## 配置示例 以下示例包含当前支持的所有场景可配置的完整参数。 @@ -180,6 +217,27 @@ task配置为free_benchmark时,开启**无标杆比对**,在NPU环境下通 } ``` +### PyTorch场景task配置为run_ut + +```json +{ + "task": "run_ut", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + "enable_dataloader": false, + + "run_ut": { + "white_list": [], + "black_list": [], + "error_data_path": "./" + } +} +``` + ### MindSpore场景task配置为statistics ```json @@ -394,4 +452,4 @@ train_loader = torch.utils.data.DataLoader( 关闭dropout: -在使用from atat.pytorch import PrecisionDebugger后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。 +在使用from msprobe.pytorch import 
PrecisionDebugger后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。 diff --git a/debug/accuracy_tools/atat/config/config.json b/debug/accuracy_tools/msprobe/config/config.json similarity index 84% rename from debug/accuracy_tools/atat/config/config.json rename to debug/accuracy_tools/msprobe/config/config.json index 70a630a40af..c6077b75aef 100644 --- a/debug/accuracy_tools/atat/config/config.json +++ b/debug/accuracy_tools/msprobe/config/config.json @@ -24,5 +24,10 @@ "overflow_check": { "overflow_nums": 1, "check_mode":"all" + }, + "run_ut": { + "white_list": [], + "black_list": [], + "error_data_path": "./" } } \ No newline at end of file diff --git a/debug/accuracy_tools/atat/config/img/free_benchmark.png b/debug/accuracy_tools/msprobe/config/img/free_benchmark.png similarity index 100% rename from debug/accuracy_tools/atat/config/img/free_benchmark.png rename to debug/accuracy_tools/msprobe/config/img/free_benchmark.png diff --git a/debug/accuracy_tools/atat/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py similarity index 91% rename from debug/accuracy_tools/atat/core/common/const.py rename to debug/accuracy_tools/msprobe/core/common/const.py index dea829c3ffa..c1a453a21a6 100644 --- a/debug/accuracy_tools/atat/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -2,6 +2,7 @@ import os import stat import numpy as np + class Const: """ Class for const @@ -15,6 +16,10 @@ class Const: OFF = 'OFF' BACKWARD = 'backward' FORWARD = 'forward' + DEFAULT_LIST = [] + DEFAULT_PATH = './' + WHITE_LIST = 'white_list' + BLACK_LIST = 'black_list' # dump mode ALL = "all" @@ -25,6 +30,8 @@ class Const: API_LIST = "api_list" API_STACK = "api_stack" DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + AUTO = "auto" + ONLINE_DUMP_MODE = [ALL, LIST, AUTO, OFF] SUMMARY = "summary" MD5 = "md5" SUMMARY_MODE = 
[ALL, SUMMARY, MD5] @@ -35,8 +42,10 @@ class Const: PKL_SUFFIX = ".pkl" NUMPY_SUFFIX = ".npy" + PT_SUFFIX = ".pt" ONE_GB = 1073741824 # 1 * 1024 * 1024 * 1024 TEN_GB = 10737418240 # 10 * 1024 * 1024 * 1024 + ONE_MB = 1048576 # 1 * 1024 * 1024 FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' DISTRIBUTED_PREFIX_LENGTH = 60 # env dump path @@ -52,13 +61,15 @@ class Const: ENV_ENABLE = "1" ENV_DISABLE = "0" MAX_SEED_VALUE = 4294967295 # 2**32 - 1 - TASK_LIST = ["tensor", "statistics", "overflow_check", "free_benchmark"] + TASK_LIST = ["tensor", "statistics", "overflow_check", "free_benchmark", "run_ut"] LEVEL_LIST = ["L0", "L1", "L2", "mix"] STATISTICS = "statistics" TENSOR = "tensor" OVERFLOW_CHECK = "overflow_check" FREE_BENCHMARK = "free_benchmark" + RUN_UT = "run_ut" ATTR_NAME_PREFIX = "wrap_" + ATTR_NAME_PREFIX_LEN = len(ATTR_NAME_PREFIX) KERNEL_DUMP = "kernel_dump" DATA = "data" PT_FRAMEWORK = "pytorch" @@ -70,12 +81,12 @@ class Const: INT_TYPE = [np.int32, np.int64] NPU = 'NPU' DISTRIBUTED = 'Distributed' - + INPLACE_LIST = [ "broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", - "_reduce_scatter_base", "_all_gather_base", "send", "recv", "irecv", "isend", "all_to_all_single" + "_reduce_scatter_base", "_all_gather_base", "send", "recv", "irecv", "isend", "all_to_all_single", "all_to_all" ] - + CONVERT = { "int32_to_int64": ["torch.int32", "torch.int64"], } @@ -84,6 +95,7 @@ class Const: "int32_to_int64": ["cross_entropy"] } + class CompareConst: """ Class for compare module const @@ -196,6 +208,7 @@ class CompareConst: MAX_RELATIVE_OUT_YELLOW = 0.1 MAX_RELATIVE_IN_YELLOW = 0.01 + class FileCheckConst: """ Class for file check const @@ -232,6 +245,7 @@ class FileCheckConst: YAML_SUFFIX: MAX_YAML_SIZE } + class OverflowConst: """ Class for Overflow @@ -239,3 +253,17 @@ class OverflowConst: OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE" OVERFLOW_ORIGINAL_MODE = 0 OVERFLOW_DEBUG_MODE = 1 + + +class MsConst: + CELL = "cell" + API = 
"api" + KERNEL = "kernel" + TOOL_LEVEL_DICT = { + "L0": CELL, + "L1": API, + "L2": KERNEL + } + PYNATIVE_MODE = "pynative" + GRAPH_GE_MODE = "graph_ge" + GRAPH_KBYK_MODE = "graph_kbyk" diff --git a/debug/accuracy_tools/atat/core/common/exceptions.py b/debug/accuracy_tools/msprobe/core/common/exceptions.py similarity index 52% rename from debug/accuracy_tools/atat/core/common/exceptions.py rename to debug/accuracy_tools/msprobe/core/common/exceptions.py index f6b7c19ba32..ea61f8cd58f 100644 --- a/debug/accuracy_tools/atat/core/common/exceptions.py +++ b/debug/accuracy_tools/msprobe/core/common/exceptions.py @@ -1,19 +1,20 @@ class CodedException(Exception): def __init__(self, code, error_info=''): super().__init__() + self.code = code self.error_info = self.err_strs.get(code) + error_info def __str__(self): return self.error_info -class MsaccException(CodedException): +class MsprobeException(CodedException): INVALID_PARAM_ERROR = 0 OVERFLOW_NUMS_ERROR = 1 err_strs = { - INVALID_PARAM_ERROR: "[msacc] 无效参数: ", - OVERFLOW_NUMS_ERROR: "[msacc] 超过预设溢出次数 当前溢出次数:" + INVALID_PARAM_ERROR: "[msprobe] 无效参数: ", + OVERFLOW_NUMS_ERROR: "[msprobe] 超过预设溢出次数 当前溢出次数:" } @@ -26,12 +27,12 @@ class FileCheckException(CodedException): FILE_TOO_LARGE_ERROR = 5 err_strs = { - SOFT_LINK_ERROR: "[msacc] 检测到软链接: ", - FILE_PERMISSION_ERROR: "[msacc] 文件权限错误: ", - INVALID_FILE_ERROR: "[msacc] 无效文件: ", - ILLEGAL_PATH_ERROR: "[msacc] 非法文件路径: ", - ILLEGAL_PARAM_ERROR: "[msacc] 非法打开方式: ", - FILE_TOO_LARGE_ERROR: "[msacc] 文件过大: " + SOFT_LINK_ERROR: "[msprobe] 检测到软链接: ", + FILE_PERMISSION_ERROR: "[msprobe] 文件权限错误: ", + INVALID_FILE_ERROR: "[msprobe] 无效文件: ", + ILLEGAL_PATH_ERROR: "[msprobe] 非法文件路径: ", + ILLEGAL_PARAM_ERROR: "[msprobe] 非法打开方式: ", + FILE_TOO_LARGE_ERROR: "[msprobe] 文件过大: " } @@ -39,8 +40,8 @@ class ParseJsonException(CodedException): UnexpectedNameStruct = 0 InvalidDumpJson = 1 err_strs = { - UnexpectedNameStruct: "[msacc] Unexpected name in json: ", - InvalidDumpJson: "[msacc] 
json格式不正确: ", + UnexpectedNameStruct: "[msprobe] Unexpected name in json: ", + InvalidDumpJson: "[msprobe] json格式不正确: ", } @@ -49,23 +50,23 @@ class ScopeException(CodedException): InvalidScope = 1 ArgConflict = 2 err_strs = { - InvalidApiStr: "[msacc] Invalid api_list: ", - InvalidScope: "[msacc] Invalid scope: ", - ArgConflict: "[msacc] Scope and api_list conflict: ", + InvalidApiStr: "[msprobe] Invalid api_list: ", + InvalidScope: "[msprobe] Invalid scope: ", + ArgConflict: "[msprobe] Scope and api_list conflict: ", } class RepairException(CodedException): InvalidRepairType = 0 err_strs = { - InvalidRepairType: "[msacc] Invalid repair_type: " + InvalidRepairType: "[msprobe] Invalid repair_type: " } class StepException(CodedException): InvalidPostProcess = 0 err_strs = { - InvalidPostProcess: "[msacc] 错误的step后处理配置: ", + InvalidPostProcess: "[msprobe] 错误的step后处理配置: ", } @@ -73,8 +74,8 @@ class FreeBenchmarkException(CodedException): UnsupportedType = 0 InvalidGrad = 1 err_strs = { - UnsupportedType: "[msacc] Free benchmark get unsupported type: ", - InvalidGrad: "[msacc] Free benchmark gradient invalid: ", + UnsupportedType: "[msprobe] Free benchmark get unsupported type: ", + InvalidGrad: "[msprobe] Free benchmark gradient invalid: ", } diff --git a/debug/accuracy_tools/atat/core/common/file_check.py b/debug/accuracy_tools/msprobe/core/common/file_check.py similarity index 98% rename from debug/accuracy_tools/atat/core/common/file_check.py rename to debug/accuracy_tools/msprobe/core/common/file_check.py index 2df825aa351..36896cfbc19 100644 --- a/debug/accuracy_tools/atat/core/common/file_check.py +++ b/debug/accuracy_tools/msprobe/core/common/file_check.py @@ -17,9 +17,9 @@ import os import re -from atat.core.common.log import logger -from atat.core.common.exceptions import FileCheckException -from atat.core.common.const import FileCheckConst +from msprobe.core.common.log import logger +from msprobe.core.common.exceptions import FileCheckException +from 
msprobe.core.common.const import FileCheckConst class FileChecker: diff --git a/debug/accuracy_tools/atat/core/common/log.py b/debug/accuracy_tools/msprobe/core/common/log.py similarity index 100% rename from debug/accuracy_tools/atat/core/common/log.py rename to debug/accuracy_tools/msprobe/core/common/log.py diff --git a/debug/accuracy_tools/atat/core/common/utils.py b/debug/accuracy_tools/msprobe/core/common/utils.py similarity index 97% rename from debug/accuracy_tools/atat/core/common/utils.py rename to debug/accuracy_tools/msprobe/core/common/utils.py index 088530f3c5c..6e901deb9eb 100644 --- a/debug/accuracy_tools/atat/core/common/utils.py +++ b/debug/accuracy_tools/msprobe/core/common/utils.py @@ -26,9 +26,9 @@ from datetime import datetime, timezone from pathlib import Path import numpy as np -from atat.core.common.file_check import FileOpen, FileChecker -from atat.core.common.const import Const, FileCheckConst, CompareConst, OverflowConst -from atat.core.common.log import logger +from msprobe.core.common.file_check import FileOpen, FileChecker +from msprobe.core.common.const import Const, FileCheckConst, CompareConst, OverflowConst +from msprobe.core.common.log import logger device = collections.namedtuple('device', ['type', 'index']) @@ -148,7 +148,7 @@ def check_summary_only_valid(summary_only): return summary_only -def check_compare_param(input_parma, output_path, stack_mode=False, summary_compare=False, md5_compare=False): +def check_compare_param(input_parma, output_path, summary_compare=False, md5_compare=False): if not (isinstance(input_parma, dict) and isinstance(output_path, str)): logger.error("Invalid input parameters") raise CompareException(CompareException.INVALID_PARAM_ERROR) @@ -318,15 +318,6 @@ def execute_command(cmd): raise CompareException(CompareException.INVALID_DATA_ERROR) -def save_numpy_data(file_path, data): - """ - save_numpy_data - """ - if not os.path.exists(os.path.dirname(file_path)): - 
os.makedirs(os.path.dirname(file_path)) - np.save(file_path, data) - - def parse_value_by_comma(value): """ parse value by comma, like '1,2,4,8' diff --git a/debug/accuracy_tools/atat/core/common_config.py b/debug/accuracy_tools/msprobe/core/common_config.py similarity index 54% rename from debug/accuracy_tools/atat/core/common_config.py rename to debug/accuracy_tools/msprobe/core/common_config.py index e256372ca87..d6c15e101e7 100644 --- a/debug/accuracy_tools/atat/core/common_config.py +++ b/debug/accuracy_tools/msprobe/core/common_config.py @@ -1,6 +1,6 @@ -from atat.core.common.const import Const -from atat.core.common.log import logger -from atat.core.common.exceptions import MsaccException +from msprobe.core.common.const import Const +from msprobe.core.common.log import logger +from msprobe.core.common.exceptions import MsprobeException class CommonConfig: @@ -18,24 +18,27 @@ class CommonConfig: def _check_config(self): if self.task and self.task not in Const.TASK_LIST: - logger.error_log_with_exp( - "task is invalid, it should be one of {}".format(Const.TASK_LIST), MsaccException(MsaccException.INVALID_PARAM_ERROR)) + logger.error_log_with_exp("task is invalid, it should be one of {}".format(Const.TASK_LIST), + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if self.rank is not None and not isinstance(self.rank, list): - logger.error_log_with_exp("rank is invalid, it should be a list", MsaccException(MsaccException.INVALID_PARAM_ERROR)) + logger.error_log_with_exp("rank is invalid, it should be a list", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if self.step is not None and not isinstance(self.step, list): - logger.error_log_with_exp("step is invalid, it should be a list", MsaccException(MsaccException.INVALID_PARAM_ERROR)) + logger.error_log_with_exp("step is invalid, it should be a list", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if self.level and self.level not in Const.LEVEL_LIST: - logger.error_log_with_exp( - 
"level is invalid, it should be one of {}".format(Const.LEVEL_LIST), MsaccException(MsaccException.INVALID_PARAM_ERROR)) + logger.error_log_with_exp("level is invalid, it should be one of {}".format(Const.LEVEL_LIST), + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if self.seed is not None and not isinstance(self.seed, int): - logger.error_log_with_exp("seed is invalid, it should be an integer", MsaccException(MsaccException.INVALID_PARAM_ERROR)) + logger.error_log_with_exp("seed is invalid, it should be an integer", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if not isinstance(self.is_deterministic, bool): - logger.error_log_with_exp( - "is_deterministic is invalid, it should be a boolean", MsaccException(MsaccException.INVALID_PARAM_ERROR)) + logger.error_log_with_exp("is_deterministic is invalid, it should be a boolean", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if not isinstance(self.enable_dataloader, bool): - logger.error_log_with_exp( - "enable_dataloader is invalid, it should be a boolean", MsaccException(MsaccException.INVALID_PARAM_ERROR)) - + logger.error_log_with_exp("enable_dataloader is invalid, it should be a boolean", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) + class BaseConfig: def __init__(self, json_config): @@ -44,15 +47,17 @@ class BaseConfig: self.data_mode = json_config.get('data_mode') self.backward_input = json_config.get("backward_input") self.file_format = json_config.get("file_format") - self.summary_mode = json_config.get("summary_mode") - self.overflow_num = json_config.get("overflow_num") + self.summary_mode = json_config.get("summary_mode") + self.overflow_nums = json_config.get("overflow_nums") self.check_mode = json_config.get("check_mode") def check_config(self): if self.scope is not None and not isinstance(self.scope, list): - logger.error_log_with_exp("scope is invalid, it should be a list", MsaccException(MsaccException.INVALID_PARAM_ERROR)) + 
logger.error_log_with_exp("scope is invalid, it should be a list", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if self.list is not None and not isinstance(self.list, list): - logger.error_log_with_exp("list is invalid, it should be a list", MsaccException(MsaccException.INVALID_PARAM_ERROR)) + logger.error_log_with_exp("list is invalid, it should be a list", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) if self.data_mode is not None and not isinstance(self.data_mode, list): - logger.error_log_with_exp("data_mode is invalid, it should be a list", MsaccException(MsaccException.INVALID_PARAM_ERROR)) - + logger.error_log_with_exp("data_mode is invalid, it should be a list", + MsprobeException(MsprobeException.INVALID_PARAM_ERROR)) diff --git a/debug/accuracy_tools/atat/core/data_dump/data_collector.py b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py similarity index 85% rename from debug/accuracy_tools/atat/core/data_dump/data_collector.py rename to debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index f6a9a70b138..db437539afe 100644 --- a/debug/accuracy_tools/atat/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -1,11 +1,10 @@ - import os -from atat.core.data_dump.scope import build_scope, ListScope -from atat.core.data_dump.json_writer import DataWriter -from atat.core.common.log import logger -from atat.core.common.const import Const -from atat.core.data_dump.data_processor.factory import DataProcessorFactory +from msprobe.core.data_dump.scope import build_scope, ListScope +from msprobe.core.data_dump.json_writer import DataWriter +from msprobe.core.common.log import logger +from msprobe.core.common.const import Const +from msprobe.core.data_dump.data_processor.factory import DataProcessorFactory def build_data_collector(config): @@ -21,7 +20,8 @@ class DataCollector: self.config = config self.data_writer = DataWriter() self.data_processor = 
DataProcessorFactory.create_processor(self.config, self.data_writer) - self.module_processor = DataProcessorFactory.get_module_processor(self.config.framework) if self.config.framework == Const.PT_FRAMEWORK else None + self.module_processor = DataProcessorFactory.get_module_processor(self.config.framework) \ + if self.config.framework == Const.PT_FRAMEWORK else None self.module_count = {} if self.config.task == Const.FREE_BENCHMARK: self.scope = build_scope(ListScope, self.config.scope, self.config.list) @@ -35,7 +35,7 @@ class DataCollector: @property def dump_file_path(self): return self.data_writer.dump_file_path - + @staticmethod def check_scope_and_pid(scope, name, pid): return (not scope or scope.check(name)) and pid == os.getpid() @@ -43,10 +43,10 @@ class DataCollector: @staticmethod def is_inplace(module): return getattr(module, "op_is_inplace", False) - + def if_return_forward_new_output(self): return self.data_processor.if_return_forward_new_output() - + def get_forward_new_output(self): return self.data_processor.get_forward_new_output() @@ -88,8 +88,11 @@ class DataCollector: else: data_info = self.data_processor.analyze_forward_inplace(name, module_input_output) if self.config.level == "L2": - return + return self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name)) + if self.data_processor.stop_run(): + self.handle_data(name, data_info, use_buffer=False) + raise Exception("[msprobe] exit") self.handle_data(name, data_info) def backward_data_collect(self, name, module, pid, module_input_output): @@ -98,6 +101,9 @@ class DataCollector: return data_info = self.data_processor.analyze_backward(name, module, module_input_output) + if self.data_processor.stop_run(): + self.handle_data(name, data_info, use_buffer=False) + raise Exception("[msprobe] exit") self.handle_data(name, data_info) def update_construct(self, name): @@ -105,12 +111,15 @@ class DataCollector: self.data_writer.update_construct({name: 
self.module_processor.api_parent_node}) self.data_writer.update_construct(self.module_processor.module_node) - def handle_data(self, name, data_info): + def handle_data(self, name, data_info, use_buffer=True): msg = f"msProbe is collecting data on {name}. " if data_info: msg = self.update_data(data_info, msg) logger.info(msg) - self.data_writer.flush_data_when_buffer_is_full() + if use_buffer: + self.data_writer.flush_data_when_buffer_is_full() + else: + self.write_json() def module_count_func(self, name, name_template): module_name = name.split(Const.SEP)[-3] @@ -135,6 +144,6 @@ class DataCollector: def update_dump_paths(self, *args): self.data_writer.update_dump_paths(*args) self.data_writer.initialize_json_file(task=self.config.task, level=self.config.level) - + def update_iter(self, current_iter): self.data_processor.update_iter(current_iter) diff --git a/debug/accuracy_tools/atat/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py similarity index 87% rename from debug/accuracy_tools/atat/core/data_dump/data_processor/base.py rename to debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index 208c053192c..2fbc86b5656 100644 --- a/debug/accuracy_tools/atat/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -3,9 +3,9 @@ import inspect from dataclasses import dataclass from typing import Tuple, Dict, Optional, Any import numpy as np -from atat.core.common.log import logger -from atat.core.common.utils import convert_tuple -from atat.core.common.const import Const +from msprobe.core.common.log import logger +from msprobe.core.common.utils import convert_tuple +from msprobe.core.common.const import Const @dataclass @@ -35,11 +35,11 @@ class ModuleBackwardInputsOutputs: @property def grad_input_tuple(self): return convert_tuple(self.grad_input) - + @property def grad_output_tuple(self): - return convert_tuple(self.grad_output) - + 
return convert_tuple(self.grad_output) + class TensorStatInfo: def __init__(self, max_val=None, min_val=None, mean_val=None, norm_val=None): @@ -53,7 +53,7 @@ class BaseDataProcessor: _recursive_key_stack = [] special_type = (np.integer, np.floating, np.bool_, np.complexfloating, np.str_, np.byte, np.unicode_, bool, int, float, str, slice) - + def __init__(self, config, data_writer): self.data_writer = data_writer self.config = config @@ -65,11 +65,11 @@ class BaseDataProcessor: self.current_iter = 0 self._return_forward_new_output = False self._forward_new_output = None - + @property def data_path(self): return self.data_writer.dump_tensor_data_dir - + @staticmethod def analyze_api_call_stack(name): stack_str = [] @@ -87,7 +87,7 @@ class BaseDataProcessor: stack_str.append(stack_line) stack_info_struct = {name: stack_str} return stack_info_struct - + @staticmethod def _convert_numpy_to_builtin(arg): type_mapping = { @@ -103,26 +103,15 @@ class BaseDataProcessor: if isinstance(arg, numpy_type): return builtin_type(arg), type(arg).__name__ return arg, '' - + @staticmethod def _analyze_numpy(value, numpy_type): return {"type": numpy_type, "value": value} - - @staticmethod - def _analyze_builtin(arg): - single_arg = {} - if isinstance(arg, slice): - single_arg.update({"type": "slice"}) - single_arg.update({"value": [arg.start, arg.stop, arg.step]}) - else: - single_arg.update({"type": type(arg).__name__}) - single_arg.update({"value": arg}) - return single_arg - + @classmethod def get_special_types(cls): return cls.special_type - + @classmethod def recursive_apply_transform(cls, args, transform): if isinstance(args, cls.get_special_types()): @@ -142,9 +131,11 @@ class BaseDataProcessor: resutl_dict[k] = cls.recursive_apply_transform(arg, transform) cls._recursive_key_stack.pop() return resutl_dict - else: + elif args is not None: logger.warning(f"Data type {type(args)} is not supported.") return None + else: + return None def if_return_forward_new_output(self): return 
self._return_forward_new_output @@ -175,13 +166,14 @@ class BaseDataProcessor: return (Const.ALL in self.config.data_mode or forward_backward in self.config.data_mode or input_output in self.config.data_mode) - - def analyze_pre_forward(self, name, module,module_input_output: ModuleForwardInputsOutputs): + + def analyze_pre_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs): pass - + def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs): api_info_struct = {} - if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT): # check whether data_mode contains forward or input + # check whether data_mode contains forward or input + if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT): api_info_struct[name] = {} self.api_data_category = Const.INPUT args_info_list = self.analyze_element(module_input_output.args_tuple) @@ -190,13 +182,14 @@ class BaseDataProcessor: kwargs_info_list = self.analyze_element(module_input_output.kwargs) api_info_struct[name][Const.INPUT_KWARGS] = kwargs_info_list - if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT): # check whether data_mode contains forward or output + # check whether data_mode contains forward or output + if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT): api_info_struct[name] = api_info_struct.get(name, {}) self.api_data_category = Const.OUTPUT output_info_list = self.analyze_element(module_input_output.output_tuple) api_info_struct[name][Const.OUTPUT] = output_info_list return api_info_struct - + def analyze_pre_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs): api_info_struct = {} if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT): @@ -208,7 +201,7 @@ class BaseDataProcessor: kwargs_info_list = self.analyze_element(module_input_output.kwargs) api_info_struct[name][Const.INPUT_KWARGS] = kwargs_info_list return api_info_struct - + def analyze_forward_inplace(self, name, module_input_output: 
ModuleForwardInputsOutputs): concat_args = module_input_output.concat_args_and_kwargs() api_info_struct = {} @@ -218,26 +211,29 @@ class BaseDataProcessor: output_info_list = self.analyze_element(concat_args) api_info_struct[name][Const.OUTPUT] = output_info_list return api_info_struct - + def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs): api_info_struct = {} - if self.is_dump_for_data_mode(Const.BACKWARD, Const.OUTPUT): + if self.is_dump_for_data_mode(Const.BACKWARD, Const.INPUT): api_info_struct[name] = {} - self.api_data_category = Const.OUTPUT + self.api_data_category = Const.INPUT input_info_list = self.analyze_element(module_input_output.grad_input_tuple) - api_info_struct[name][Const.GRAD_INPUT] = input_info_list + api_info_struct[name][Const.INPUT] = input_info_list - if self.is_dump_for_data_mode(Const.BACKWARD, Const.INPUT): + if self.is_dump_for_data_mode(Const.BACKWARD, Const.OUTPUT): api_info_struct[name] = api_info_struct.get(name, {}) - self.api_data_category = Const.INPUT + self.api_data_category = Const.OUTPUT output_info_list = self.analyze_element(module_input_output.grad_output_tuple) - api_info_struct[name][Const.GRAD_OUTPUT] = output_info_list + api_info_struct[name][Const.OUTPUT] = output_info_list return api_info_struct def get_save_file_path(self, suffix): - file_format = "pt" if self.config.framework == Const.PT_FRAMEWORK else "npy" + file_format = Const.PT_SUFFIX if self.config.framework == Const.PT_FRAMEWORK else Const.NUMPY_SUFFIX dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + Const.SEP + - suffix + Const.SEP + file_format) + suffix + file_format) file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name) - return dump_data_name, file_path \ No newline at end of file + return dump_data_name, file_path + + def stop_run(self): + return False diff --git a/debug/accuracy_tools/atat/core/data_dump/data_processor/factory.py 
b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/factory.py similarity index 89% rename from debug/accuracy_tools/atat/core/data_dump/data_processor/factory.py rename to debug/accuracy_tools/msprobe/core/data_dump/data_processor/factory.py index bcc771f3684..ad74acdeeba 100644 --- a/debug/accuracy_tools/atat/core/data_dump/data_processor/factory.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/factory.py @@ -1,10 +1,10 @@ -from atat.core.common.const import Const +from msprobe.core.common.const import Const class DataProcessorFactory: _data_processor = {} _module_processor = {} - + @classmethod def register_processor(cls, framework, task, processor_class): key = (framework, task) @@ -13,7 +13,7 @@ class DataProcessorFactory: @classmethod def register_module_processor(cls, framework, processor_class): cls._module_processor[framework] = processor_class - + @classmethod def get_module_processor(cls, framework): processor_class = cls._module_processor.get(framework) @@ -39,7 +39,7 @@ class DataProcessorFactory: TensorDataProcessor as PytorchTensorDataProcessor, OverflowCheckDataProcessor as PytorchOverflowCheckDataProcessor, FreeBenchmarkDataProcessor as PytorchFreeBenchmarkDataProcessor, - KernelDumpDataProcessor as PytorchKernelDumpDataProcessor + KernelDumpDataProcessor as PytorchKernelDumpDataProcessor ) from ....pytorch.module_processer import ModuleProcesser cls.register_processor(Const.PT_FRAMEWORK, Const.STATISTICS, PytorchStatisticsDataProcessor) @@ -47,15 +47,13 @@ class DataProcessorFactory: cls.register_processor(Const.PT_FRAMEWORK, Const.OVERFLOW_CHECK, PytorchOverflowCheckDataProcessor) cls.register_processor(Const.PT_FRAMEWORK, Const.FREE_BENCHMARK, PytorchFreeBenchmarkDataProcessor) cls.register_processor(Const.PT_FRAMEWORK, Const.KERNEL_DUMP, PytorchKernelDumpDataProcessor) - cls.register_module_processor(Const.PT_FRAMEWORK, ModuleProcesser) + cls.register_module_processor(Const.PT_FRAMEWORK, ModuleProcesser) elif 
framework == Const.MS_FRAMEWORK: from .mindspore_processor import ( StatisticsDataProcessor as MindsporeStatisticsDataProcessor, TensorDataProcessor as MindsporeTensorDataProcessor, - OverflowCheckDataProcessor as MindsporeOverflowCheckDataProcessor, - FreeBenchmarkDataProcessor as MindsporeFreeBenchmarkDataProcessor + OverflowCheckDataProcessor as MindsporeOverflowCheckDataProcessor ) cls.register_processor(Const.MS_FRAMEWORK, Const.STATISTICS, MindsporeStatisticsDataProcessor) cls.register_processor(Const.MS_FRAMEWORK, Const.TENSOR, MindsporeTensorDataProcessor) cls.register_processor(Const.MS_FRAMEWORK, Const.OVERFLOW_CHECK, MindsporeOverflowCheckDataProcessor) - cls.register_processor(Const.MS_FRAMEWORK, Const.FREE_BENCHMARK, MindsporeFreeBenchmarkDataProcessor) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py new file mode 100644 index 00000000000..c208df7d900 --- /dev/null +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -0,0 +1,206 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import zlib +import mindspore as ms +from mindspore import ops +import numpy as np + +from msprobe.core.common.const import Const +from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo, + ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs) +from msprobe.core.common.file_check import path_len_exceeds_limit, change_mode, FileCheckConst +from msprobe.mindspore.dump.hook_cell.wrap_functional import load_ops_functions +from msprobe.mindspore.common.utils import convert_bf16_to_fp32 +from msprobe.mindspore.common.log import logger +from msprobe.mindspore.dump.hook_cell.api_registry import api_register + + +class MindsporeDataProcessor(BaseDataProcessor): + mindspore_special_type = tuple([ms.Tensor]) + ops_func, mint_ops_func, _ = load_ops_functions() + + def __init__(self, config, data_writer): + super().__init__(config, data_writer) + self.mindspore_object_key = { + "dtype": self.analyze_dtype_in_kwargs + } + + @staticmethod + def get_md5_for_tensor(x): + x = convert_bf16_to_fp32(x) + tensor_bytes = x.asnumpy().tobytes() + crc32_hash = zlib.crc32(tensor_bytes) + return f"{crc32_hash:08x}" + + @staticmethod + def analyze_dtype_in_kwargs(element): + return {"type": "mindspore.dtype", "value": str(element)} + + @staticmethod + def _analyze_builtin(arg): + single_arg = {} + if isinstance(arg, slice): + single_arg.update({"type": "slice"}) + # slice参数中可能存在tensor类型,json序列化,需要转换为python数值类型 + values = [ + value if not isinstance(value, ms.Tensor) else value.item() + for value in [arg.start, arg.stop, arg.step] + ] + single_arg.update({"value": values}) + else: + single_arg.update({"type": type(arg).__name__}) + single_arg.update({"value": arg}) + return single_arg + + @classmethod + def get_special_types(cls): + return super().get_special_types() + cls.mindspore_special_type + + def get_stat_info(self, data): + tensor_stat = TensorStatInfo() + if 
data.numel() == 0: + return tensor_stat + elif data.dtype == ms.bool_: + tensor_stat.max = self.mint_ops_func["max"](data).item() + tensor_stat.min = self.mint_ops_func["min"](data).item() + elif not data.shape: + tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.item() + elif data.dtype == ms.complex64 or data.dtype == ms.complex128: + data_abs = np.abs(data.asnumpy()) + tensor_stat.max = np.max(data_abs) + tensor_stat.min = np.min(data_abs) + tensor_stat.mean = np.mean(data_abs) + tensor_stat.norm = np.linalg.norm(data_abs) + else: + if data.dtype == ms.bfloat16 or not ops.is_floating_point(data): + data = data.to(ms.float32) + api_register.norm_inner_op_set_ori_func() + tensor_stat.max = self.mint_ops_func["max"](data).item() + tensor_stat.min = self.mint_ops_func["min"](data).item() + tensor_stat.mean = self.mint_ops_func["mean"](data).item() + tensor_stat.norm = self.ops_func["norm"](data).item() + api_register.norm_inner_op_set_hook_func() + return tensor_stat + + def analyze_single_element(self, element, suffix_stack): + if suffix_stack and suffix_stack[-1] in self.mindspore_object_key: + return self.mindspore_object_key[suffix_stack[-1]](element) + + converted_numpy, numpy_type = self._convert_numpy_to_builtin(element) + if converted_numpy is not element: + return self._analyze_numpy(converted_numpy, numpy_type) + if isinstance(element, ms.Tensor): + return self._analyze_tensor(element, Const.SEP.join(suffix_stack)) + + if isinstance(element, (bool, int, float, str, slice)): + return self._analyze_builtin(element) + return {} + + def analyze_element(self, element): + return self.recursive_apply_transform(element, self.analyze_single_element) + + def _analyze_tensor(self, tensor, suffix): + tensor_stat = self.get_stat_info(tensor) + tensor_json = { + 'type': 'mindspore.Tensor', + 'dtype': str(tensor.dtype), + 'shape': tensor.shape, + 'Max': tensor_stat.max, + 'Min': tensor_stat.min, + 'Mean': tensor_stat.mean, + 'Norm': 
tensor_stat.norm + } + if self.config.summary_mode == Const.MD5: + tensor_md5 = self.get_md5_for_tensor(tensor) + tensor_json.update({Const.MD5: tensor_md5}) + return tensor_json + + +class StatisticsDataProcessor(MindsporeDataProcessor): + pass + + +class TensorDataProcessor(MindsporeDataProcessor): + def _analyze_tensor(self, tensor, suffix): + dump_data_name, file_path = self.get_save_file_path(suffix) + single_arg = super()._analyze_tensor(tensor, suffix) + single_arg.update({"data_name": dump_data_name}) + if not path_len_exceeds_limit(file_path): + tensor = convert_bf16_to_fp32(tensor) + saved_tensor = tensor.asnumpy() + np.save(file_path, saved_tensor) + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) + else: + logger.warning(f'The file path {file_path} length exceeds limit.') + return single_arg + + +class OverflowCheckDataProcessor(MindsporeDataProcessor): + __slots__ = ["cached_tensors_and_file_paths"] + + def __init__(self, config, data_writer): + super().__init__(config, data_writer) + self.cached_tensors_and_file_paths = {} + self.real_overflow_dump_times = 0 + self.overflow_nums = config.overflow_nums + + def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs): + self.has_overflow = False + api_info_struct = super().analyze_forward(name, module, module_input_output) + self.maybe_save_overflow_data() + return api_info_struct if self.has_overflow else None + + def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs): + self.has_overflow = False + api_info_struct = super().analyze_backward(name, module, module_input_output) + self.maybe_save_overflow_data() + return api_info_struct if self.has_overflow else None + + def maybe_save_overflow_data(self): + if self.has_overflow: + for file_path, tensor in self.cached_tensors_and_file_paths.items(): + tensor = convert_bf16_to_fp32(tensor) + np.save(file_path, tensor.asnumpy()) + change_mode(file_path, 
FileCheckConst.DATA_FILE_AUTHORITY) + self.real_overflow_dump_times += 1 + self.cached_tensors_and_file_paths = {} + + def stop_run(self): + if self.overflow_nums == -1: + return False + if self.real_overflow_dump_times >= self.overflow_nums: + logger.warning(f"[msprobe] 超过预设溢出次数 当前溢出次数: {self.real_overflow_dump_times}") + return True + return False + + def _analyze_maybe_overflow_tensor(self, tensor_json): + if tensor_json['Max'] is None: + return + if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']): + self.has_overflow = True + if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']): + self.has_overflow = True + + def _analyze_tensor(self, tensor, suffix): + dump_data_name, file_path = self.get_save_file_path(suffix) + if not path_len_exceeds_limit(file_path): + self.cached_tensors_and_file_paths.update({file_path: tensor}) + else: + logger.warning(f'The file path {file_path} length exceeds limit.') + single_arg = super()._analyze_tensor(tensor, suffix) + self._analyze_maybe_overflow_tensor(single_arg) + single_arg.update({"data_name": dump_data_name}) + return single_arg diff --git a/debug/accuracy_tools/atat/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py similarity index 87% rename from debug/accuracy_tools/atat/core/data_dump/data_processor/pytorch_processor.py rename to debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index cf3c5ebe586..007fec80964 100644 --- a/debug/accuracy_tools/atat/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -5,18 +5,19 @@ from typing import List import numpy as np import torch -from atat.core.common.exceptions import MsaccException -from atat.core.common.file_check import path_len_exceeds_limit, change_mode -from atat.core.common.log import logger -from atat.core.common.const import Const, OverflowConst, 
FileCheckConst -from atat.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \ +from msprobe.core.common.exceptions import MsprobeException +from msprobe.core.common.file_check import path_len_exceeds_limit, change_mode +from msprobe.core.common.log import logger +from msprobe.core.common.const import Const, OverflowConst, FileCheckConst +from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \ ModuleForwardInputsOutputs, TensorStatInfo -from atat.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow +from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow try: import torch_npu + is_gpu = False except ImportError: - pass + is_gpu = True class PytorchDataProcessor(BaseDataProcessor): @@ -76,6 +77,42 @@ class PytorchDataProcessor(BaseDataProcessor): tensor_stat.mean = torch._C._VariableFunctionsClass.mean(data_clone).item() tensor_stat.norm = torch._C._VariableFunctionsClass.norm(data_clone).item() return tensor_stat + + @staticmethod + def handle_tensor_extremum_nan_inf(tensor, operator): + data_clone = tensor.detach() + data_nan = torch._C._VariableFunctionsClass.isnan(data_clone) + if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel(): + return float('nan') + finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone) + if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0: + finite_values = data_clone[finite_mask] + return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(finite_values).item() + else: + data_no_nan = data_clone[~data_nan] + return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(data_no_nan).item() + + @staticmethod + def _analyze_builtin(arg): + single_arg = {} + if isinstance(arg, slice): + single_arg.update({"type": "slice"}) + # 
slice参数中可能存在tensor类型,json序列化,需要转换为python数值类型 + values = [ + value if not isinstance(value, torch.Tensor) else value.item() + for value in [arg.start, arg.stop, arg.step] + ] + single_arg.update({"value": values}) + else: + single_arg.update({"type": type(arg).__name__}) + single_arg.update({"value": arg}) + return single_arg + + @staticmethod + def _analyze_torch_size(arg): + return {"type": "torch.Size", "value": list(arg)} @classmethod def get_special_types(cls): @@ -93,14 +130,11 @@ class PytorchDataProcessor(BaseDataProcessor): return self._analyze_tensor(element, Const.SEP.join(suffix_stack)) if isinstance(element, (bool, int, float, str, slice)): return self._analyze_builtin(element) - return None + return {} def analyze_element(self, element): return self.recursive_apply_transform(element, self.analyze_single_element) - def _analyze_torch_size(arg): - return {"type": "torch.Size", "value": list(arg)} - def _analyze_tensor(self, tensor, suffix): tensor_stat = self.get_stat_info(tensor) tensor_json = {} @@ -112,9 +146,17 @@ class PytorchDataProcessor(BaseDataProcessor): tensor_json.update({"Mean": tensor_stat.mean}) tensor_json.update({"Norm": tensor_stat.norm}) tensor_json.update({"requires_grad": tensor.requires_grad}) - if self.config.summary_mode == "md5": + + if tensor_stat.max is not None: + if np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max): + tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max") + if tensor_stat.min is not None: + if np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min): + tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min") + + if self.config.summary_mode == Const.MD5: tensor_md5 = self.get_md5_for_tensor(tensor) - tensor_json.update({"md5": tensor_md5}) + tensor_json.update({Const.MD5: tensor_md5}) return tensor_json @@ -142,7 +184,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): super().__init__(config, data_writer) 
self.cached_tensors_and_file_paths = {} self.real_overflow_dump_times = 0 - self.overflow_nums = config.overflow_num + self.overflow_nums = config.overflow_nums self.bits_for_overflow = 8 @staticmethod @@ -150,21 +192,6 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE) return overflow_mode == Const.ENV_ENABLE - @staticmethod - def handle_tensor_extremum_nan_inf(data_clone, operator): - data_nan = torch._C._VariableFunctionsClass.isnan(data_clone) - if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel(): - return float('nan') - finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone) - if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0: - finite_values = data_clone[finite_mask] - return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \ - torch._C._VariableFunctionsClass.min(finite_values).item() - else: - data_no_nan = data_clone[~data_nan] - return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \ - torch._C._VariableFunctionsClass.min(data_no_nan).item() - def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs): self.has_overflow = False api_info_struct = super().analyze_forward(name, module, module_input_output) @@ -190,7 +217,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): if self.overflow_nums == -1: return if self.real_overflow_dump_times >= self.overflow_nums: - raise MsaccException(MsaccException.OVERFLOW_NUMS_ERROR, str(self.real_overflow_dump_times)) + raise MsprobeException(MsprobeException.OVERFLOW_NUMS_ERROR, str(self.real_overflow_dump_times)) def check_overflow_npu(self): if self.overflow_debug_mode_enalbe(): @@ -210,16 +237,13 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): else: torch_npu._C._clear_overflow_npu() - def _analyze_maybe_overflow_tensor(self, tensor_json, tensor): - 
data_clone = tensor.detach() - if hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan(): + def _analyze_maybe_overflow_tensor(self, tensor_json): + if is_gpu or (hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan()): if tensor_json['Max'] is None: return if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']): - tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "max") self.has_overflow = True if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']): - tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "min") self.has_overflow = True else: self.has_overflow = self.check_overflow_npu() @@ -233,7 +257,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): else: logger.warning(f'The file path {file_path} length exceeds limit.') single_arg = super()._analyze_tensor(tensor, suffix) - self._analyze_maybe_overflow_tensor(single_arg, tensor) + self._analyze_maybe_overflow_tensor(single_arg) single_arg.update({"data_name": dump_data_name}) return single_arg diff --git a/debug/accuracy_tools/atat/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py similarity index 85% rename from debug/accuracy_tools/atat/core/data_dump/json_writer.py rename to debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 23f37b2342e..112e45171ef 100644 --- a/debug/accuracy_tools/atat/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -4,9 +4,9 @@ import fcntl import json from pathlib import Path -from atat.core.common.file_check import change_mode -from atat.core.common.log import logger -from atat.core.common.const import Const, FileCheckConst +from msprobe.core.common.file_check import change_mode, FileOpen +from msprobe.core.common.log import logger +from msprobe.core.common.const import Const, FileCheckConst class DataWriter: @@ -30,20 
+30,20 @@ class DataWriter: return is_exists = os.path.exists(file_path) append = "a+" if is_exists else "w+" - with os.fdopen( - os.open(file_path, Const.WRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), append, newline="" - ) as csv_file: + with FileOpen(file_path, append) as csv_file: spawn_writer = csv.writer(csv_file) if not is_exists: spawn_writer.writerow(result_header) spawn_writer.writerows([result,]) + is_new_file = not is_exists + if is_new_file: + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) def initialize_json_file(self, **kwargs): kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}}) - with os.fdopen( - os.open(self.dump_file_path, Const.OVERWRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), 'w' - ) as f: + with FileOpen(self.dump_file_path, 'w') as f: json.dump(kwargs, f) + change_mode(self.dump_file_path, FileCheckConst.DATA_FILE_AUTHORITY) if os.path.exists(self.stack_file_path): os.remove(self.stack_file_path) @@ -83,7 +83,7 @@ class DataWriter: def write_data_json(self, file_path): logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. 
") if Path(file_path).exists() and os.path.getsize(file_path) > 0: - with open(file_path, "r+") as f: + with FileOpen(file_path, "r+") as f: fcntl.flock(f, fcntl.LOCK_EX) data_to_write = json.load(f) fcntl.flock(f, fcntl.LOCK_UN) @@ -91,7 +91,7 @@ class DataWriter: self.init_json['data_path'] = self.dump_tensor_data_dir data_to_write = self.init_json data_to_write[Const.DATA].update(self.cache_data[Const.DATA]) - with open(file_path, 'w+') as f: + with FileOpen(file_path, 'w+') as f: fcntl.flock(f, fcntl.LOCK_EX) json.dump(data_to_write, f, indent=1) fcntl.flock(f, fcntl.LOCK_UN) @@ -99,13 +99,13 @@ class DataWriter: self.cache_data[Const.DATA].clear() def write_stack_info_json(self, file_path): - with open(file_path, 'w+') as f: + with FileOpen(file_path, 'w+') as f: fcntl.flock(f, fcntl.LOCK_EX) json.dump(self.cache_stack, f, indent=1) fcntl.flock(f, fcntl.LOCK_UN) def write_construct_info_json(self, file_path): - with open(file_path, 'w+') as f: + with FileOpen(file_path, 'w+') as f: fcntl.flock(f, fcntl.LOCK_EX) json.dump(self.cache_construct, f, indent=1) fcntl.flock(f, fcntl.LOCK_UN) diff --git a/debug/accuracy_tools/atat/core/data_dump/scope.py b/debug/accuracy_tools/msprobe/core/data_dump/scope.py similarity index 98% rename from debug/accuracy_tools/atat/core/data_dump/scope.py rename to debug/accuracy_tools/msprobe/core/data_dump/scope.py index e7114f343fe..1d74c3e461a 100644 --- a/debug/accuracy_tools/atat/core/data_dump/scope.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/scope.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from atat.core.common.exceptions import ScopeException -from atat.core.common.const import Const +from msprobe.core.common.exceptions import ScopeException +from msprobe.core.common.const import Const def build_scope(scope_class, scope=None, api_list=None): diff --git a/debug/accuracy_tools/msprobe/mindspore/__init__.py b/debug/accuracy_tools/msprobe/mindspore/__init__.py new file mode 100644 index 
00000000000..3bf42d1e39f --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/__init__.py @@ -0,0 +1 @@ +from msprobe.mindspore.debugger.precision_debugger import PrecisionDebugger diff --git a/debug/accuracy_tools/msprobe/mindspore/common/log.py b/debug/accuracy_tools/msprobe/mindspore/common/log.py new file mode 100644 index 00000000000..ec027c75013 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/common/log.py @@ -0,0 +1,38 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import os +import time +import sys + +from msprobe.mindspore.common.utils import get_rank_if_initialized +from msprobe.core.common.log import BaseLogger +from msprobe.core.common.exceptions import DistributedNotInitializedError + + +class MindsporeLogger(BaseLogger): + def __init__(self): + super().__init__() + + def get_rank(self): + try: + current_rank = get_rank_if_initialized() + except DistributedNotInitializedError: + current_rank = None + + return current_rank + + +logger = MindsporeLogger() \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py new file mode 100644 index 00000000000..6abf0a1ee88 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py @@ -0,0 +1,44 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import mindspore as ms +from msprobe.core.common.exceptions import DistributedNotInitializedError + + +def get_rank_if_initialized(): + if ms.communication.GlobalComm.INITED: + return ms.communication.get_rank() + else: + raise DistributedNotInitializedError("mindspore distributed environment is not initialized") + + +def convert_bf16_to_fp32(tensor): + if tensor.dtype == ms.bfloat16: + tensor = tensor.to(ms.float32) + return tensor + + +class MsprobeStep(ms.train.Callback): + + def __init__(self, debugger): + super(MsprobeStep, self).__init__() + self.debugger = debugger + + def on_train_step_begin(self, run_context): + self.debugger.start() + + def on_train_step_end(self, run_context): + self.debugger.stop() + self.debugger.step() diff --git a/debug/accuracy_tools/atat/mindspore/debugger/__init__.py b/debug/accuracy_tools/msprobe/mindspore/debugger/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/mindspore/debugger/__init__.py rename to debug/accuracy_tools/msprobe/mindspore/debugger/__init__.py diff --git a/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py similarity index 69% rename from debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py rename to debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py index 56a4b9bf758..23cb7294b8d 100644 --- a/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py +++ b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py @@ -1,13 +1,10 @@ import os +from msprobe.core.common.utils import Const +from msprobe.core.common.const import MsConst -class DebuggerConfig: - convert_map = { - "L0": "cell", - "L1": "api", - "L2": 'kernel' - } +class DebuggerConfig: def __init__(self, common_config, task_config): self.dump_path = common_config.dump_path self.task = common_config.task @@ -15,18 +12,22 @@ 
class DebuggerConfig: self.step = [] if not common_config.step else common_config.step if not common_config.level: common_config.level = "L1" - self.level = DebuggerConfig.convert_map[common_config.level] + self.level = MsConst.TOOL_LEVEL_DICT.get(common_config.level, MsConst.API) + self.level_ori = common_config.level self.list = [] if not task_config.list else task_config.list - self.data_mode = [] if not task_config.data_mode else task_config.data_mode + self.scope = [] if not task_config.scope else task_config.scope + self.data_mode = [] if not task_config.data_mode else task_config.data_mode self.file_format = task_config.file_format + self.overflow_nums = 1 if not task_config.overflow_nums else task_config.overflow_nums self.check_mode = task_config.check_mode - + self.framework = Const.MS_FRAMEWORK + self.summary_mode = task_config.summary_mode self.check() def check(self): if not self.dump_path: raise Exception("Dump path is empty.") - if not os.path.isabs(self.dump_path): + if self.level_ori != "L1" and not os.path.isabs(self.dump_path): raise Exception("Dump path must be absolute path.") if not self.task: self.task = "statistics" diff --git a/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py similarity index 31% rename from debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py rename to debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py index 0099074762f..5475dc3586c 100644 --- a/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py +++ b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py @@ -1,7 +1,12 @@ import os -from atat.mindspore.ms_config import parse_json_config -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.task_handler_factory import TaskHandlerFactory + +import mindspore as ms + +from msprobe.mindspore.service import Service +from 
msprobe.mindspore.ms_config import parse_json_config +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.task_handler_factory import TaskHandlerFactory +from msprobe.core.common.const import MsConst class PrecisionDebugger: @@ -12,6 +17,8 @@ class PrecisionDebugger: cls._instance = super().__new__(cls) cls._instance.initialized = False cls._instance.config = None + cls.service = None + cls.first_start = False return cls._instance def __init__(self, config_path=None): @@ -23,10 +30,46 @@ class PrecisionDebugger: self.config = DebuggerConfig(common_config, task_config) self.initialized = True + @staticmethod + def _get_execution_mode(): + if ms.get_context("mode") == ms.GRAPH_MODE: + if ms.context.get_jit_config().get("jit_level") == "O2" or ms.get_context("jit_level") == "O2": + return MsConst.GRAPH_GE_MODE + else: + return MsConst.GRAPH_KBYK_MODE + else: + return MsConst.PYNATIVE_MODE + @classmethod - def start(cls, target=None): + def start(cls): instance = cls._instance if not instance: raise Exception("No instance of PrecisionDebugger found.") - handler = TaskHandlerFactory.create(instance.config) - handler.handle() + + instance.config.execution_mode = instance._get_execution_mode() + if instance.config.execution_mode == MsConst.PYNATIVE_MODE and instance.config.level == MsConst.API: + if not instance.service: + instance.service = Service(instance.config) + instance.service.start() + else: + if not instance.first_start: + handler = TaskHandlerFactory.create(instance.config) + handler.handle() + + instance.first_start = True + + @classmethod + def stop(cls): + instance = cls._instance + if not instance: + raise Exception("PrecisionDebugger instance is not created.") + if instance.service: + instance.service.stop() + + @classmethod + def step(cls): + instance = cls._instance + if not instance: + raise Exception("PrecisionDebugger instance is not created.") + if instance.service: + instance.service.step() diff --git 
a/debug/accuracy_tools/atat/mindspore/doc/dump.md b/debug/accuracy_tools/msprobe/mindspore/doc/dump.md similarity index 81% rename from debug/accuracy_tools/atat/mindspore/doc/dump.md rename to debug/accuracy_tools/msprobe/mindspore/doc/dump.md index 3321a4da12b..425d0683a26 100644 --- a/debug/accuracy_tools/atat/mindspore/doc/dump.md +++ b/debug/accuracy_tools/msprobe/mindspore/doc/dump.md @@ -1,8 +1,8 @@ # **精度数据采集** -atat工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 +msprobe工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 -执行dump操作需要安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +执行dump操作需要安装msprobe工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 ## dump接口介绍 @@ -12,7 +12,7 @@ atat工具主要通过在训练脚本内添加dump接口并启动训练的方式 通过加载dump配置文件的方式来确定dump操作的详细配置。 -可以在from atat.mindspore import PrecisionDebugger和模型初始化之间的任意位置添加该接口。 +可以在from msprobe.mindspore import PrecisionDebugger和模型初始化之间的任意位置添加该接口。 **原型** @@ -43,7 +43,7 @@ debugger.start() ## 示例代码 ```Python -from atat.mindspore import PrecisionDebugger +from msprobe.mindspore import PrecisionDebugger debugger = PrecisionDebugger(config_path="./config.json") # 请勿将以上初始化流程插入到循环代码中 # 下面代码也可以用PrecisionDebugger.start() diff --git a/debug/accuracy_tools/atat/mindspore/dump/__init__.py b/debug/accuracy_tools/msprobe/mindspore/dump/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/mindspore/dump/__init__.py rename to debug/accuracy_tools/msprobe/mindspore/dump/__init__.py diff --git a/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py b/debug/accuracy_tools/msprobe/mindspore/dump/api_kbk_dump.py similarity index 91% rename from debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py rename to debug/accuracy_tools/msprobe/mindspore/dump/api_kbk_dump.py index a53841189f5..5c7af45d790 100644 --- a/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py +++ b/debug/accuracy_tools/msprobe/mindspore/dump/api_kbk_dump.py @@ -1,9 +1,9 @@ import os import json -from atat.core.common.utils import make_dump_path_if_not_exists 
-from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.core.common.log import logger -from atat.core.common.file_check import FileOpen +from msprobe.core.common.utils import make_dump_path_if_not_exists +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.core.common.log import logger +from msprobe.core.common.file_check import FileOpen class ApiKbkDump: diff --git a/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py similarity index 82% rename from debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py rename to debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py index ab534edc243..2c4579b0e75 100644 --- a/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py @@ -1,6 +1,6 @@ -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.dump.api_kbk_dump import ApiKbkDump -from atat.mindspore.dump.kernel_graph_dump import KernelGraphDump +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.dump.api_kbk_dump import ApiKbkDump +from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump class DumpToolFactory: diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/api_registry.py b/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/api_registry.py new file mode 100644 index 00000000000..5508416fde0 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/api_registry.py @@ -0,0 +1,104 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class ApiRegistry:
    """Swap MindSpore tensor/functional/mint APIs between their original
    implementations and the msprobe hooked wrappers.

    The ``*_ori_attr`` dicts keep the untouched callables so hooking is fully
    reversible; the ``*_hook_attr`` dicts keep the wrapped callables harvested
    from the HOOK* holder classes.
    """

    def __init__(self):
        # original (unhooked) callables, keyed by api name
        self.tensor_ori_attr = {}
        self.functional_ori_attr = {}
        self.mint_ops_ori_attr = {}
        self.mint_func_ops_ori_attr = {}
        self.norm_inner_ops_ori_attr = {}

        # hooked wrappers, keyed by api name
        self.tensor_hook_attr = {}
        self.functional_hook_attr = {}
        self.mint_ops_hook_attr = {}
        self.mint_func_ops_hook_attr = {}
        self.norm_inner_ops_hook_attr = {}

        # ops that msprobe itself may call while processing dump data; kept
        # separate so they can be un-hooked independently and internal use
        # does not trigger the hooks recursively
        self.norm_inner_ops = ["norm", "square", "sqrt", "is_complex"]

    @staticmethod
    def store_ori_attr(ori_api_group, api_list, api_ori_attr):
        """Record the original attribute of every api in *api_list*.

        A dotted name (containing ``Const.SEP``) is resolved one level deep
        relative to *ori_api_group* before the attribute is fetched.
        """
        for api in api_list:
            if Const.SEP in api:
                sub_module_name, sub_op = api.rsplit(Const.SEP, 1)
                sub_module = getattr(ori_api_group, sub_module_name)
                api_ori_attr[api] = getattr(sub_module, sub_op)
            else:
                api_ori_attr[api] = getattr(ori_api_group, api)

    @staticmethod
    def set_api_attr(api_group, attr_dict):
        """Assign every attribute in *attr_dict* onto *api_group*.

        Missing sub-modules are skipped silently so a partial attr_dict never
        raises while installing or restoring hooks.
        """
        for api, api_attr in attr_dict.items():
            if Const.SEP in api:
                sub_module_name, sub_op = api.rsplit(Const.SEP, 1)
                sub_module = getattr(api_group, sub_module_name, None)
                if sub_module is not None:
                    setattr(sub_module, sub_op, api_attr)
            else:
                setattr(api_group, api, api_attr)

    @staticmethod
    def _harvest_hook_attr(hook_class, target_dict):
        """Copy every wrapped api from *hook_class* into *target_dict*,
        stripping the msprobe attribute-name prefix from the key."""
        for attr_name in dir(hook_class):
            if attr_name.startswith(Const.ATTR_NAME_PREFIX):
                target_dict[attr_name[Const.ATTR_NAME_PREFIX_LEN:]] = getattr(hook_class, attr_name)

    def norm_inner_op_set_hook_func(self):
        """Install hooks only for the norm-related inner ops."""
        self.set_api_attr(ms.ops, self.norm_inner_ops_hook_attr)

    def norm_inner_op_set_ori_func(self):
        """Restore the original norm-related inner ops."""
        self.set_api_attr(ms.ops, self.norm_inner_ops_ori_attr)

    def api_set_hook_func(self):
        """Install the hooked wrappers on all supported api groups."""
        self.set_api_attr(ms.Tensor, self.tensor_hook_attr)
        self.set_api_attr(ms.ops, self.functional_hook_attr)
        self.set_api_attr(ms.mint, self.mint_ops_hook_attr)
        self.set_api_attr(ms.mint.nn.functional, self.mint_func_ops_hook_attr)

    def api_set_ori_func(self):
        """Restore the original implementations on all api groups."""
        self.set_api_attr(ms.Tensor, self.tensor_ori_attr)
        self.set_api_attr(ms.ops, self.functional_ori_attr)
        self.set_api_attr(ms.mint, self.mint_ops_ori_attr)
        self.set_api_attr(ms.mint.nn.functional, self.mint_func_ops_ori_attr)

    def initialize_hook(self, hook):
        """Create the hooked wrappers with *hook* and cache both the original
        and the hooked callables.

        Nothing is installed on the api groups here; call
        :meth:`api_set_hook_func` afterwards to activate the hooks.
        """
        self.store_ori_attr(ms.Tensor, get_tensor_ops(), self.tensor_ori_attr)
        wrap_tensor_ops_and_bind(hook)
        self._harvest_hook_attr(HOOKTensor, self.tensor_hook_attr)

        functional_ops, mint_ops, mint_func_ops = get_functional_ops()
        self.store_ori_attr(ms.ops, self.norm_inner_ops, self.norm_inner_ops_ori_attr)
        self.store_ori_attr(ms.ops, functional_ops, self.functional_ori_attr)
        self.store_ori_attr(ms.mint, mint_ops, self.mint_ops_ori_attr)
        self.store_ori_attr(ms.mint.nn.functional, mint_func_ops, self.mint_func_ops_ori_attr)
        setup_hooks(hook)
        self._harvest_hook_attr(HOOKFunctionalOP, self.functional_hook_attr)
        # the norm inner ops are a subset of the functional ops; keep a
        # separate copy so they can be toggled independently
        for api_name, api_attr in self.functional_hook_attr.items():
            if api_name in self.norm_inner_ops:
                self.norm_inner_ops_hook_attr[api_name] = api_attr
        self._harvest_hook_attr(HOOKMintOP, self.mint_ops_hook_attr)
        self._harvest_hook_attr(HOOKMintNNFunctionalOP, self.mint_func_ops_hook_attr)


api_register = ApiRegistry()
+# ============================================================================ +from collections import defaultdict + +from mindspore import nn +from msprobe.core.common.const import Const + + +class HOOKCell(nn.Cell): + cell_count = defaultdict(int) + g_stop_hook = False + + def __init__(self, build_hook) -> None: + super(HOOKCell, self).__init__() + self.changed_status = False + self.input_kwargs = {} + self.prefix = "" + if not HOOKCell.g_stop_hook: + HOOKCell.g_stop_hook = True + self.changed_status = True + if hasattr(self, "prefix_op_name_"): + self.prefix = self.prefix_op_name_ + + HOOKCell.cell_count[self.prefix] += 1 + self.prefix = self.prefix + str(HOOKCell.cell_count[self.prefix] - 1) + Const.SEP + forward_hook, backward_hook = build_hook(self.prefix) + self.register_forward_hook(forward_hook) + self.register_backward_hook(backward_hook) + + # 重载call,加全局标志。 + def __call__(self, *args, **kwargs): + try: + self.input_kwargs = kwargs + out = super(HOOKCell, self).__call__(*args, **kwargs) + except Exception as e: + raise e + finally: + if self.changed_status: + self.changed_status = False + HOOKCell.g_stop_hook = False + return out diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml b/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml new file mode 100644 index 00000000000..089f444b618 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml @@ -0,0 +1,925 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# List of ops that register hooks + + +ops: + - adaptive_avg_pool1d + - adaptive_avg_pool2d + - adaptive_avg_pool3d + - adaptive_max_pool1d + - adaptive_max_pool2d + - avg_pool1d + - avg_pool2d + - avg_pool3d + - batch_norm + - bias_add + - ctc_greedy_decoder + - conv1d + - conv2d + - conv3d + - deformable_conv2d + - dense + - dropout + - dropout1d + - dropout2d + - dropout3d + - flatten + - fold + - fractional_max_pool3d + - lp_pool1d + - lp_pool2d + - lrn + - max_pool2d + - max_pool3d + - max_unpool1d + - max_unpool2d + - max_unpool3d + - unfold + - binary_cross_entropy + - binary_cross_entropy_with_logits + - cosine_embedding_loss + - cross_entropy + - ctc_loss + - gaussian_nll_loss + - hinge_embedding_loss + - huber_loss + - kl_div + - l1_loss + - margin_ranking_loss + - mse_loss + - multi_margin_loss + - multilabel_margin_loss + - multilabel_soft_margin_loss + - nll_loss + - smooth_l1_loss + - triplet_margin_loss + - elu + - fast_gelu + - gelu + - glu + - gumbel_softmax + - hardshrink + - hardsigmoid + - hardswish + - hardtanh + - leaky_relu + - log_softmax + - logsigmoid + - mish + - prelu + - relu + - relu6 + - celu + - rrelu + - selu + - sigmoid + - silu + - softmax + - softmin + - softshrink + - softsign + - tanh + - threshold + - cdist + - dist + - pdist + - choice_with_mask + - random_categorical + - log_uniform_candidate_sampler + - uniform_candidate_sampler + - affine_grid + - bounding_box_decode + - bounding_box_encode + - col2im + - check_valid + - crop_and_resize + - grid_sample + - interpolate + - iou + - pad + - padding + - pixel_shuffle + - pixel_unshuffle + - upsample + - abs + - absolute + - accumulate_n + - acos + - arccos + - acosh + - add + - addcdiv + - addcmul + - addmv + - addn + - angle + - arccosh + - arcsin + - arcsinh + - arctan + - arctanh + - 
arctan2 + - asin + - asinh + - atan + - atan2 + - atanh + - atleast_1d + - atleast_2d + - atleast_3d + - bessel_i0 + - bessel_i0e + - bessel_i1 + - bessel_i1e + - bessel_j0 + - bessel_j1 + - bessel_k0 + - bessel_k0e + - bessel_k1 + - bessel_k1e + - bessel_y0 + - bessel_y1 + - bitwise_and + - bitwise_left_shift + - bitwise_or + - bitwise_right_shift + - bitwise_xor + - ceil + - clamp + - clip + - combinations + - copysign + - cos + - cosh + - cosine_similarity + - cov + - diag_embed + - diff + - deg2rad + - digamma + - div + - divide + - erf + - erfc + - erfinv + - exp + - exp2 + - expm1 + - floor + - floor_div + - floor_mod + - float_power + - fmod + - frac + - gcd + - hypot + - igamma + - igammac + - imag + - i0 + - inv + - invert + - lcm + - ldexp + - lerp + - log + - log2 + - log10 + - log1p + - logaddexp + - logaddexp2 + - logical_and + - logical_not + - logical_or + - logical_xor + - logit + - mul + - multiply + - mvlgamma + - neg + - negative + - nextafter + - polar + - polygamma + - positive + - pow + - rad2deg + - ravel + - real + - reciprocal + - remainder + - rot90 + - round + - rsqrt + - sgn + - sign + - signbit + - sin + - sinc + - sinh + - sqrt + - square + - sub + - subtract + - t + - tan + - tanhshrink + - trapz + - tril_indices + - triu_indices + - true_divide + - trunc + - truncate_div + - truncate_mod + - xdivy + - xlogy + - zeta + - all + - amax + - amin + - aminmax + - any + - argmax + - argmin + - cummax + - cummin + - cumprod + - cumsum + - fmax + - histc + - logsumexp + - max + - mean + - median + - min + - norm + - prod + - std + - std_mean + - var + - var_mean + - argsort + - approximate_equal + - equal + - ge + - greater + - greater_equal + - gt + - intopk + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - isreal + - is_complex + - le + - less + - less_equal + - lt + - maximum + - minimum + - msort + - ne + - not_equal + - searchsorted + - topk + - bmm + - addbmm + - addmm + - baddbmm + - addr + - adjoint + - 
cholesky + - cholesky_solve + - batch_dot + - dot + - eig + - inner + - inverse + - geqrf + - ger + - kron + - lu_solve + - lu_unpack + - matmul + - matrix_solve + - matrix_band_part + - matrix_diag + - matrix_diag_part + - matrix_set_diag + - mm + - mv + - outer + - orgqr + - ormqr + - pinv + - svd + - tensor_dot + - logdet + - slogdet + - qr + - trace + - bartlett_window + - blackman_window + - hamming_window + - hann_window + - kaiser_window + - eye + - fill + - full + - full_like + - linspace + - logspace + - one_hot + - arange + - range + - heaviside + - bernoulli + - gamma + - laplace + - multinomial + - multinomial_with_replacement + - rand + - rand_like + - randint + - randint_like + - randn + - randn_like + - random_gamma + - random_poisson + - randperm + - standard_laplace + - standard_normal + - uniform + - argwhere + - batch_to_space_nd + - bincount + - block_diag + - broadcast_to + - cat + - channel_shuffle + - chunk + - column_stack + - concat + - conj + - count_nonzero + - deepcopy + - diag + - diagflat + - diagonal + - dyn_shape + - dsplit + - dstack + - einsum + - expand + - expand_dims + - flip + - fliplr + - flipud + - gather_d + - gather_elements + - gather_nd + - hsplit + - hstack + - index_add + - index_fill + - index_select + - inplace_add + - inplace_index_add + - inplace_sub + - inplace_update + - masked_fill + - masked_select + - meshgrid + - moveaxis + - movedim + - narrow + - nan_to_num + - nansum + - normal + - nonzero + - population_count + - rank + - repeat_elements + - repeat_interleave + - reshape + - reverse + - reverse_sequence + - roll + - scatter + - scatter_nd + - select + - sequence_mask + - shuffle + - size + - slice + - sort + - space_to_batch_nd + - sparse_segment_mean + - split + - squeeze + - stack + - strided_slice + - sum + - swapaxes + - swapdims + - tensor_scatter_add + - tensor_scatter_div + - tensor_scatter_max + - tensor_scatter_min + - tensor_scatter_mul + - tensor_scatter_sub + - tensor_scatter_elements + - 
tensor_split + - tile + - tril + - triu + - transpose + - unbind + - unique + - unique_consecutive + - unique_with_pad + - unsorted_segment_max + - unsorted_segment_min + - unsorted_segment_prod + - unsorted_segment_sum + - unsqueeze + - unstack + - view_as_real + - vsplit + - vstack + - where + - cross + - renorm + - is_tensor + - scalar_cast + - scalar_to_tensor + - tuple_to_array + - clip_by_global_norm + - clip_by_value + - assign + - assign_add + - assign_sub + - scatter_add + - scatter_div + - scatter_max + - scatter_min + - scatter_mul + - scatter_nd_add + - scatter_nd_div + - scatter_nd_max + - scatter_nd_min + - scatter_nd_mul + - scatter_nd_sub + - scatter_update + - derivative + - jet + +tensor: + - __abs__ + - __add__ + - __and__ + - __bool__ + - __eq__ + - __ge__ + - __gt__ + - __iadd__ + - __ifloordiv__ + - __imatmul__ + - __imod__ + - __imul__ + - __isub__ + - __le__ + - __lt__ + - __matmul__ + - __mod__ + - __mul__ + - __ne__ + - __neg__ + - __or__ + - __pow__ + - __radd__ + - __rmatmul__ + - __rmod__ + - __rmul__ + - __rpow__ + - __rsub__ + - __sub__ + - __truediv__ + - __xor__ + - abs + - absolute + - acos + - acosh + - add + - addbmm + - addcdiv + - addcmul + - addmm + - addmv + - addr + - all + - amax + - amin + - any + - arccos + - arccosh + - argmax + - angle + - arcsin + - arcsinh + - arctan + - arctanh + - argmin + - argsort + - asin + - asinh + - atan + - atan2 + - atanh + - baddbmm + - bernoulli + - bincount + - bitwise_and + - bitwise_or + - bitwise_xor + - bmm + - bool + - broadcast_to + - ceil + - cholesky_solve + - cholesky + - clamp + - clip + - conj + - copysign + - cos + - cosh + - cross + - cummax + - cummin + - cumprod + - cumsum + - deg2rad + - diag + - diagflat + - diff + - digamma + - div + - divide + - equal + - erf + - erfc + - erfinv + - exp + - expand_as + - expm1 + - flip + - fliplr + - flipud + - float_power + - floor + - fmod + - frac + - gather_elements + - ge + - geqrf + - ger + - greater + - greater_equal + - gt + - 
half + - hardshrink + - heaviside + - histc + - hypot + - i0 + - igamma + - igammac + - imag + - index_add + - index_fill + - index_put + - index_select + - inner + - int + - inverse + - isclose + - isfinite + - isinf + - isnan + - is_complex + - is_signed + - isneginf + - isposinf + - isreal + - lcm + - ldexp + - le + - lerp + - less + - less_equal + - log + - log10 + - log1p + - log2 + - logaddexp + - logaddexp2 + - logdet + - logical_and + - logical_not + - logical_or + - logical_xor + - logit + - logsumexp + - long + - lt + - masked_fill + - masked_scatter + - masked_select + - matmul + - max + - maximum + - mean + - median + - min + - minimum + - moveaxis + - movedim + - msort + - multinomial + - multiply + - mvlgamma + - nan_to_num + - nansum + - narrow + - ne + - neg + - negative + - nelement + - new_ones + - new_zeros + - nextafter + - norm + - nonzero + - not_equal + - ormqr + - permute + - pow + - prod + - qr + - ravel + - real + - reciprocal + - remainder + - renorm + - rad2deg + - tile + - repeat_interleave + - reshape + - reshape + - round + - rot90 + - rsqrt + - sum_to_size + - scatter + - sgn + - short + - sigmoid + - sign + - signbit + - sin + - sinc + - sinh + - slogdet + - sort + - split + - sqrt + - square + - squeeze + - std + - subtract + - subtract + - svd + - swapaxes + - swapdims + - t + - take + - tan + - tanh + - trace + - swapaxes + - tile + - to + - topk + - tril + - tensor_split + - transpose + - true_divide + - trunc + - unbind + - unique_consecutive + - unsqueeze + - var + - view + - where + - xlogy + - from_numpy + - std + - take + - var + - all + - any + - copy + - diagonal + - flatten + - resize + - sum + +mint.ops: + - abs + - absolute_import + - add + - add_ex + - all + - any + - any_ex + - arange + - argmax + - avg_pool2d + - baddbmm + - baddbmm_ex + - batch_norm + - binary_cross_entropy_with_logits + - bitwise_and + - bitwise_or + - bitwise_xor + - bmm + - broadcast_to + - cat + - cat_ex + - ceil + - chunk + - clamp + - conv2d 
+ - conv_transpose2d + - cos + - cross + - cummax + - cummin + - cumsum + - div + - divide + - dropout + - embedding + - eq + - erf + - erfinv + - exp + - flatten + - flip + - flip_ex + - fold + - full + - functional + - gather + - gelu + - greater + - grid_sample + - group_norm + - gt + - index_select + - interpolate + - isclose + - isfinite + - layer_norm + - le + - leaky_relu + - less + - less_equal + - linear + - linspace + - log + - logical_and + - logical_not + - logical_or + - lt + - masked_select + - matmul + - max + - max_pool2d + - maximum + - mean + - mean_ex + - min + - minimum + - mul + - ne + - neg + - negative + - nn + - nonzero + - normal + - one_hot + - ones + - ones_ex + - ones_like + - pad + - permute + - permute_ex + - pow + - prod + - reciprocal + - relu + - remainder + - repeat_interleave + - rsqrt + - scatter + - scatter_add + - searchsorted + - sigmoid + - silu + - sin + - softmax + - softplus + - sort + - split + - sqrt + - sqrt_ex + - square + - stack + - sub + - sub_ex + - sum + - tanh + - tile + - topk + - tril + - triu + - unfold + - unique + - where + - xlogy + - zeros + - zeros_ex + - zeros_like + +mint.nn: + - Dropout + - Embedding + - Fold + - LayerNorm + - Linear + - MaxPool2d + - Unfold + - Upsample + +mint.nn.functional: + - absolute_import + - avg_pool2d + - batch_norm + - batch_norm_ex + - bce_with_logits + - binary_cross_entropy_with_logits + - conv_transpose2d + - dense + - dropout + - embedding + - fold + - gelu + - grid_sample + - group_norm + - interpolate + - layer_norm + - leaky_relu + - linear + - max_pool2d + - max_pool2d_ex + - normal + - one_hot + - one_hot_ext + - pad + - relu + - sigmoid + - silu + - softmax + - softmax_ex + - softplus + - tanh + - unfold diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/wrap_functional.py b/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/wrap_functional.py new file mode 100644 index 00000000000..be3d1bd2545 --- /dev/null +++ 
cur_path = os.path.dirname(os.path.realpath(__file__))
yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")


def load_ops_functions():
    """Snapshot the current callables of ms.ops, ms.mint and
    ms.mint.nn.functional, keyed by attribute name."""
    ops_func = {name: getattr(ms.ops, name) for name in dir(ms.ops)}
    mint_ops_func = {name: getattr(ms.mint, name) for name in dir(ms.mint)}
    mint_func_ops_func = {name: getattr(ms.mint.nn.functional, name) for name in dir(ms.mint.nn.functional)}
    return ops_func, mint_ops_func, mint_func_ops_func


def get_functional_ops():
    """Return the op-name sets to wrap for each api group: the yaml whitelist
    intersected with the apis actually present in this MindSpore build."""
    ops_func, mint_ops_func, mint_func_ops_func = load_ops_functions()
    with FileOpen(yaml_path, 'r') as f:
        config = yaml.safe_load(f)
    # guard against a missing yaml section: set(None) would raise TypeError
    wrap_functional_ops = config.get("ops") or []
    wrap_mint_ops = config.get("mint.ops") or []
    wrap_mint_functional_ops = config.get("mint.nn.functional") or []
    return (
        set(wrap_functional_ops) & set(ops_func.keys()),
        set(wrap_mint_ops) & set(mint_ops_func.keys()),
        set(wrap_mint_functional_ops) & set(mint_func_ops_func.keys())
    )


class HOOKFunctionalOP(object):
    # holder for wrapped ms.ops apis (attributes set by setup_hooks)
    pass


class HOOKMintOP(object):
    # holder for wrapped ms.mint apis
    pass


class HOOKMintNNFunctionalOP(object):
    # holder for wrapped ms.mint.nn.functional apis
    pass


class FunctionalOPTemplate(HOOKCell):
    """HOOKCell wrapper around a single functional-style op."""

    def __init__(self, op_name, op_dict, prefix, hook):
        self.op_name = op_name
        self.op_func = op_dict[op_name]
        self.prefix_op_name_ = prefix + str(op_name.split(Const.SEP)[-1]) + Const.SEP
        super().__init__(hook)

    def construct(self, *args, **kwargs):
        # dropout is bypassed (input returned unchanged), presumably to keep
        # dumps deterministic across runs -- confirm before relying on it
        if self.op_name.startswith('dropout'):
            return args[0] if args else kwargs.get('input')
        return self.op_func(*args, **kwargs)


def wrap_functional_op(op_name, op_dict, prefix, hook):
    """Return a plain function that routes calls through FunctionalOPTemplate."""
    def op_template(*args, **kwargs):
        return FunctionalOPTemplate(op_name, op_dict, prefix, hook)(*args, **kwargs)
    return op_template


def wrap_functional_ops_and_bind(ops, op_dict, prefix, hook, hook_class):
    """Attach a wrapped version of every callable op in *ops* to *hook_class*."""
    for op_name in ops:
        if callable(op_dict[op_name]):
            setattr(hook_class, Const.ATTR_NAME_PREFIX + op_name, wrap_functional_op(op_name, op_dict, prefix, hook))


def setup_hooks(hook):
    """Create hooked wrappers for every supported op and bind them to the
    HOOK* holder classes."""
    functional_ops, mint_ops, mint_func_ops = get_functional_ops()
    # reuse load_ops_functions instead of rebuilding the dir() snapshots inline
    ops_func, mint_ops_func, mint_func_ops_func = load_ops_functions()
    wrap_functional_ops_and_bind(functional_ops, ops_func, "Functional.", hook, HOOKFunctionalOP)
    wrap_functional_ops_and_bind(mint_ops, mint_ops_func, "Mint.", hook, HOOKMintOP)
    wrap_functional_ops_and_bind(mint_func_ops, mint_func_ops_func, "MintFunctional.", hook, HOOKMintNNFunctionalOP)
cur_path = os.path.dirname(os.path.realpath(__file__))
yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
with FileOpen(yaml_path, 'r') as f:
    WrapTensorOps = yaml.safe_load(f).get('tensor')

# snapshot of ms.Tensor attributes taken at import time; a comprehension
# avoids the module-level loop variable shadowing the file handle above
TensorFunc = {name: getattr(ms.Tensor, name) for name in dir(ms.Tensor)}


def get_tensor_ops():
    """Return the yaml-whitelisted tensor methods that exist on ms.Tensor."""
    # read-only access: no need for a ``global`` statement
    return set(WrapTensorOps) & set(dir(ms.Tensor))


class HOOKTensor(object):
    # holder for wrapped ms.Tensor methods (attributes set by
    # wrap_tensor_ops_and_bind)
    pass


class TensorOPTemplate(HOOKCell):
    """HOOKCell wrapper around a single ms.Tensor method."""

    def __init__(self, op_name, hook):
        self.op_name_ = op_name
        self.prefix_op_name_ = "Tensor." + str(op_name) + Const.SEP
        super().__init__(hook)

    def construct(self, *args, **kwargs):
        # delegate to the original (unhooked) method captured in TensorFunc
        return TensorFunc[str(self.op_name_)](*args, **kwargs)


def wrap_tensor_op(op_name, hook):
    """Return a plain function that routes calls through TensorOPTemplate."""
    def tensor_op_template(*args, **kwargs):
        return TensorOPTemplate(op_name, hook)(*args, **kwargs)

    return tensor_op_template


def wrap_tensor_ops_and_bind(hook):
    """Attach a wrapped version of every callable whitelisted tensor method
    to HOOKTensor."""
    for op_name in get_tensor_ops():
        if callable(TensorFunc[op_name]):
            setattr(HOOKTensor, Const.ATTR_NAME_PREFIX + str(op_name), wrap_tensor_op(op_name, hook))
json -from atat.core.common_config import CommonConfig, BaseConfig -from atat.core.common.file_check import FileOpen +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.core.common.file_check import FileOpen +from msprobe.core.common.const import Const class TensorConfig(BaseConfig): @@ -31,39 +32,43 @@ class StatisticsConfig(BaseConfig): if self.data_mode is not None and len(self.data_mode) > 0: if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]: raise Exception("data_mode must be all, input or output") + if self.summary_mode and self.summary_mode not in ["statistics", "md5"]: + raise Exception("summary_mode is invalid") -class OverflowCheck(BaseConfig): +class OverflowCheckConfig(BaseConfig): def __init__(self, json_config): super().__init__(json_config) - self.file_format = None - self.check_mode = json_config.get("check_mode") + self.data_mode = ["all"] self._check_config() def _check_config(self): - if self.data_mode is not None and len(self.data_mode) > 0: - if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]: - raise Exception("data_mode must be all, input or output") + if self.overflow_nums is not None and not isinstance(self.overflow_nums, int): + raise Exception("overflow_nums is invalid, it should be an integer") + if self.overflow_nums is not None and self.overflow_nums != -1 and self.overflow_nums <= 0: + raise Exception("overflow_nums should be -1 or positive integer") if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]: raise Exception("check_mode is invalid") +TaskDict = { + Const.TENSOR: TensorConfig, + Const.STATISTICS: StatisticsConfig, + Const.OVERFLOW_CHECK: OverflowCheckConfig, +} + + def parse_common_config(json_config): return CommonConfig(json_config) def parse_task_config(task, json_config): - task_map = json_config[task] + task_map = json_config.get(task) if not task_map: task_map = dict() - if task == "tensor": - return 
TensorConfig(task_map) - elif task == "statistics": - return StatisticsConfig(task_map) - elif task == "overflow_check": - return OverflowCheck(task_map) - else: + if task not in TaskDict: raise Exception("task is invalid.") + return TaskDict.get(task)(task_map) def parse_json_config(json_file_path): @@ -73,6 +78,6 @@ def parse_json_config(json_file_path): json_config = json.load(file) common_config = parse_common_config(json_config) if not common_config.task: - common_config.task = "statistics" + common_config.task = Const.STATISTICS task_config = parse_task_config(common_config.task, json_config) return common_config, task_config diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/__init__.py b/debug/accuracy_tools/msprobe/mindspore/overflow_check/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/mindspore/overflow_check/__init__.py rename to debug/accuracy_tools/msprobe/mindspore/overflow_check/__init__.py diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py b/debug/accuracy_tools/msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py similarity index 89% rename from debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py rename to debug/accuracy_tools/msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py index 7a677eb3c70..6640608735d 100644 --- a/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py +++ b/debug/accuracy_tools/msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py @@ -1,9 +1,9 @@ import os import json -from atat.core.common.utils import make_dump_path_if_not_exists -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.core.common.log import logger -from atat.core.common.file_check import FileOpen +from msprobe.core.common.utils import make_dump_path_if_not_exists +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.core.common.log 
import logger +from msprobe.core.common.file_check import FileOpen class KernelGraphOverflowCheck: diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py similarity index 81% rename from debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py rename to debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py index fe53359be1b..d809c714211 100644 --- a/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py @@ -1,5 +1,5 @@ -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck class OverflowCheckToolFactory: diff --git a/debug/accuracy_tools/msprobe/mindspore/service.py b/debug/accuracy_tools/msprobe/mindspore/service.py new file mode 100644 index 00000000000..50776aaf109 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/service.py @@ -0,0 +1,152 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class Service:
    """Drive msprobe data dumping for MindSpore: registers api hooks,
    manages the per-step dump switch and lays out the dump directories."""

    def __init__(self, config):
        self.model = None
        # deep-copy so later mutation of the caller's config cannot affect us
        self.config = copy.deepcopy(config)
        self.config.level = self.config.level_ori
        self.data_collector = build_data_collector(self.config)
        self.switch = False          # True while data should be collected
        self.current_iter = 0
        self.first_start = True      # hooks are registered only once
        self.current_rank = None
        self.dump_iter_dir = None
        self.start_call = False      # guards stop() without a prior start()

    def build_hook(self, module_type, name):
        """Return a (forward_hook, backward_hook) pair whose dump entries are
        prefixed with *name* plus the forward/backward suffix.

        *module_type* is currently unused but kept for interface parity with
        the module-level hook path.
        """
        pid = os.getpid()  # bound up-front; both closures capture it

        def forward_hook(api_or_module_name, module, input, output):
            self.data_collector.visit_and_clear_overflow_status(api_or_module_name)
            if not self.switch:
                return None
            if self.data_collector:
                module_input_output = ModuleForwardInputsOutputs(args=input, kwargs=module.input_kwargs,
                                                                 output=output)
                self.data_collector.forward_data_collect(api_or_module_name, module, pid, module_input_output)
                if self.data_collector.if_return_forward_new_output():
                    return self.data_collector.get_forward_new_output()
            # kwargs were only needed for collection; drop the reference
            del module.input_kwargs
            return output

        def backward_hook(api_or_module_name, module, grad_input, grad_output):
            self.data_collector.visit_and_clear_overflow_status(api_or_module_name)
            if not self.switch:
                return
            if self.data_collector:
                module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_input, grad_output=grad_output)
                self.data_collector.backward_data_collect(api_or_module_name, module, pid, module_input_output)

        # bind the dump-entry name now; HOOKCell supplies the remaining
        # arguments at call time. Returning the partials directly replaces
        # the former pass-through wrap_* wrappers, which only delegated.
        forward_name_template = name + Const.FORWARD
        backward_name_template = name + Const.BACKWARD
        return (functools.partial(forward_hook, forward_name_template),
                functools.partial(backward_hook, backward_name_template))

    def step(self):
        """Advance to the next iteration and reset the per-step api counters."""
        self.current_iter += 1
        self.data_collector.update_iter(self.current_iter)
        HOOKCell.cell_count = defaultdict(int)

    def start(self, model=None):
        """Turn the dump switch on for the current step (subject to the
        configured step/rank filters); registers hooks on first use."""
        self.model = model
        self.start_call = True
        logger.info("msprobe: debugger.start() is set successfully")
        if self.config.step and self.current_iter > max(self.config.step):
            self.stop()
            raise Exception("msprobe: exit after iteration {}".format(max(self.config.step)))
        if self.config.step and self.current_iter not in self.config.step:
            return
        if self.first_start:
            try:
                self.current_rank = get_rank_if_initialized()
            except DistributedNotInitializedError:
                # single-process run: rank folder name stays empty
                self.current_rank = None

            if self.config.rank and self.current_rank not in self.config.rank:
                return
            self.register_hook_new()
            self.first_start = False
        self.switch = True
        logger.info(f"Dump switch is turned on at step {self.current_iter}. ")
        self.create_dirs()
        logger.info(f"Dump data will be saved in {self.dump_iter_dir}.")

    def stop(self):
        """Turn the dump switch off and flush collected data to json."""
        logger.info("msprobe: debugger.stop() is set successfully. "
                    "Please set debugger.start() to turn on the dump switch again. ")
        if not self.start_call:
            logger.error("msprobe: debugger.start() is not set in the current scope.")
            raise Exception("debugger.start() is not set in the current scope.")
        if self.config.step and self.current_iter not in self.config.step:
            return
        if self.config.rank and self.current_rank not in self.config.rank:
            return
        self.switch = False
        self.start_call = False
        self.data_collector.write_json()

    def create_dirs(self):
        """Create dump_path/step{N}/rank{R}[/dump_tensor_data] and point the
        data collector at the json files inside it."""
        check_path_before_create(self.config.dump_path)
        if not os.path.exists(self.config.dump_path):
            Path(self.config.dump_path).mkdir(mode=0o750, exist_ok=True)
        file_check = FileChecker(self.config.dump_path, FileCheckConst.DIR)
        file_check.common_check()
        self.dump_iter_dir = os.path.join(self.config.dump_path, f"step{self.current_iter}")
        cur_rank = self.current_rank if self.current_rank is not None else ''
        dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}")
        if not os.path.exists(dump_dir):
            Path(dump_dir).mkdir(mode=0o750, parents=True, exist_ok=True)
        if self.config.task in self.data_collector.tasks_need_tensor_data:
            dump_data_dir = os.path.join(dump_dir, "dump_tensor_data")
            Path(dump_data_dir).mkdir(mode=0o750, exist_ok=True)
        else:
            dump_data_dir = None

        dump_file_path = os.path.join(dump_dir, "dump.json")
        stack_file_path = os.path.join(dump_dir, "stack.json")
        construct_file_path = os.path.join(dump_dir, "construct.json")
        self.data_collector.update_dump_paths(
            dump_file_path, stack_file_path, construct_file_path, dump_data_dir, None)

    def register_hook_new(self):
        """Install the api-level hooks (level L1 only)."""
        logger.info("The {} hook function is successfully mounted to the model.".format(self.config.task))
        if self.config.level == "L1":
            api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
            api_register.api_set_hook_func()
from debug/accuracy_tools/atat/mindspore/task_handler_factory.py rename to debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py index 4f80e4e89c9..7b7e6fd889c 100644 --- a/debug/accuracy_tools/atat/mindspore/task_handler_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py @@ -1,6 +1,6 @@ -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.dump.dump_tool_factory import DumpToolFactory -from atat.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory +from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory class TaskHandlerFactory: diff --git a/debug/accuracy_tools/atat/atat.py b/debug/accuracy_tools/msprobe/msprobe.py similarity index 82% rename from debug/accuracy_tools/atat/atat.py rename to debug/accuracy_tools/msprobe/msprobe.py index 90f8215b102..698165b6150 100644 --- a/debug/accuracy_tools/atat/atat.py +++ b/debug/accuracy_tools/msprobe/msprobe.py @@ -15,19 +15,19 @@ import argparse import sys -from atat.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command -from atat.pytorch.parse_tool.cli import parse as cli_parse -from atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut -from atat.pytorch.api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, \ +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command +from msprobe.pytorch.parse_tool.cli import parse as cli_parse +from msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut +from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, \ _api_precision_compare_command -from 
atat.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, \ +from msprobe.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, \ _run_overflow_check_command def main(): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, - description="atat(ascend training accuracy tools), [Powered by MindStudio].\n" + description="msprobe(mindstudio probe), [Powered by MindStudio].\n" "Providing one-site accuracy difference debugging toolkit for training on Ascend Devices.\n" f"For any issue, refer README.md first", ) diff --git a/debug/accuracy_tools/atat/pytorch/__init__.py b/debug/accuracy_tools/msprobe/pytorch/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/advisor/advisor.py b/debug/accuracy_tools/msprobe/pytorch/advisor/advisor.py similarity index 93% rename from debug/accuracy_tools/atat/pytorch/advisor/advisor.py rename to debug/accuracy_tools/msprobe/pytorch/advisor/advisor.py index 43b3f40f979..b178664d9e3 100644 --- a/debug/accuracy_tools/atat/pytorch/advisor/advisor.py +++ b/debug/accuracy_tools/msprobe/pytorch/advisor/advisor.py @@ -17,12 +17,12 @@ import os -from atat.pytorch.advisor.advisor_result import AdvisorResult -from atat.pytorch.advisor.advisor_const import AdvisorConst -from atat.pytorch.common.log import logger -from atat.core.common.utils import CompareException -from atat.core.common.file_check import FileChecker -from atat.core.common.const import Const, CompareConst, FileCheckConst +from msprobe.pytorch.advisor.advisor_result import AdvisorResult +from msprobe.pytorch.advisor.advisor_const import AdvisorConst +from msprobe.pytorch.common.log import logger +from msprobe.core.common.utils import CompareException +from msprobe.core.common.file_check import FileChecker +from msprobe.core.common.const 
import Const, CompareConst, FileCheckConst class Advisor: """ diff --git a/debug/accuracy_tools/atat/pytorch/advisor/advisor_const.py b/debug/accuracy_tools/msprobe/pytorch/advisor/advisor_const.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/advisor/advisor_const.py rename to debug/accuracy_tools/msprobe/pytorch/advisor/advisor_const.py diff --git a/debug/accuracy_tools/atat/pytorch/advisor/advisor_result.py b/debug/accuracy_tools/msprobe/pytorch/advisor/advisor_result.py similarity index 90% rename from debug/accuracy_tools/atat/pytorch/advisor/advisor_result.py rename to debug/accuracy_tools/msprobe/pytorch/advisor/advisor_result.py index a24fa2a1155..456f542e1f5 100644 --- a/debug/accuracy_tools/atat/pytorch/advisor/advisor_result.py +++ b/debug/accuracy_tools/msprobe/pytorch/advisor/advisor_result.py @@ -17,10 +17,10 @@ import os import time -from atat.pytorch.advisor.advisor_const import AdvisorConst -from atat.pytorch.common.log import logger -from atat.core.common.const import Const, FileCheckConst -from atat.core.common.file_check import change_mode +from msprobe.pytorch.advisor.advisor_const import AdvisorConst +from msprobe.pytorch.common.log import logger +from msprobe.core.common.const import Const, FileCheckConst +from msprobe.core.common.file_check import change_mode class AdvisorResult: diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/.keep b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/.keep similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/.keep rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/.keep diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/__init__.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/__init__.py diff --git 
a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/.keep b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/.keep similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/.keep rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/.keep diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/__init__.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/config.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/config.py similarity index 58% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/config.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/config.py index 0aceb691b25..760e7c862db 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/config.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/config.py @@ -1,10 +1,8 @@ import os import yaml -from atat.pytorch.api_accuracy_checker.common.utils import check_file_or_directory_path -from atat.pytorch.hook_module.utils import WrapFunctionalOps, WrapTensorOps, WrapTorchOps -from atat.core.common.file_check import FileOpen - -WrapApi = set(WrapFunctionalOps) | set(WrapTensorOps) | set(WrapTorchOps) +from msprobe.pytorch.api_accuracy_checker.common.utils import check_file_or_directory_path +from msprobe.core.common.file_check import FileOpen +from msprobe.pytorch.pt_config import RunUTConfig class Config: @@ -14,9 +12,17 @@ class Config: config = yaml.safe_load(file) self.config = {key: self.validate(key, value) for key, value in config.items()} - def validate(self, key, value): + def __getattr__(self, item): 
+ return self.config[item] + + def __str__(self): + return '\n'.join(f"{key}={value}" for key, value in self.config.items()) + + @staticmethod + def validate(key, value): validators = { 'white_list': list, + 'black_list': list, 'error_data_path': str, 'precision': int } @@ -27,22 +33,13 @@ class Config: if key == 'precision' and value < 0: raise ValueError("precision must be greater than 0") if key == 'white_list': - if not isinstance(value, list): - raise ValueError("white_list must be a list type") - if not all(isinstance(i, str) for i in value): - raise ValueError("All elements in white_list must be of str type") - invalid_api = [i for i in value if i not in WrapApi] - if invalid_api: - raise ValueError( - f"{', '.join(invalid_api)} is not in support_wrap_ops.yaml, please check the white_list") + RunUTConfig.check_filter_list_config(key, value) + if key == 'black_list': + RunUTConfig.check_filter_list_config(key, value) + if key == 'error_data_path': + RunUTConfig.check_error_data_path_config(value) return value - def __getattr__(self, item): - return self.config[item] - - def __str__(self): - return '\n'.join(f"{key}={value}" for key, value in self.config.items()) - cur_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) yaml_path = os.path.join(cur_path, "config.yaml") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/utils.py similarity index 96% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/utils.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/utils.py index 9e1b02c0154..b6e8932960c 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/common/utils.py @@ -28,10 +28,10 @@ except ImportError: else: IS_GPU = False -from atat.pytorch.common.log import logger -from atat.core.common.file_check import 
FileChecker, FileOpen, change_mode, create_directory -from atat.core.common.const import Const, FileCheckConst -from atat.core.common.utils import CompareException +from msprobe.pytorch.common.log import logger +from msprobe.core.common.file_check import FileChecker, FileOpen, change_mode, create_directory +from msprobe.core.common.const import Const, FileCheckConst +from msprobe.core.common.utils import CompareException class DumpException(CompareException): @@ -166,6 +166,7 @@ def initialize_save_path(save_path, dir_name): os.mkdir(data_path, mode=FileCheckConst.DATA_DIR_AUTHORITY) data_path_checker = FileChecker(data_path, FileCheckConst.DIR) data_path_checker.common_check() + return data_path def write_pt(file_path, tensor): diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/__init__.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/algorithm.py similarity index 98% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/algorithm.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/algorithm.py index 3982c167cca..1bb19cc048e 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/algorithm.py @@ -2,8 +2,8 @@ import torch import numpy as np -from atat.pytorch.api_accuracy_checker.compare.compare_utils import ULP_PARAMETERS -from atat.core.common.const import CompareConst +from msprobe.pytorch.api_accuracy_checker.compare.compare_utils import ULP_PARAMETERS +from msprobe.core.common.const import CompareConst 
DEFAULT_THRESHOLD = 1 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_compare.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_compare.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py index f73c83c4881..73bf7c2b8eb 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py @@ -7,19 +7,19 @@ from collections import namedtuple import torch import pandas as pd -from atat.pytorch.api_accuracy_checker.common.utils import write_csv -from atat.pytorch.api_accuracy_checker.common.config import msCheckerConfig -from atat.pytorch.api_accuracy_checker.compare.compare_utils import API_PRECISION_COMPARE_RESULT_FILE_NAME, \ +from msprobe.pytorch.api_accuracy_checker.common.utils import write_csv +from msprobe.pytorch.api_accuracy_checker.common.config import msCheckerConfig +from msprobe.pytorch.api_accuracy_checker.compare.compare_utils import API_PRECISION_COMPARE_RESULT_FILE_NAME, \ API_PRECISION_COMPARE_DETAILS_FILE_NAME, BENCHMARK_COMPARE_SUPPORT_LIST, API_PRECISION_COMPARE_UNSUPPORT_LIST, \ ApiPrecisionCompareColumn, AbsoluteStandardApi, BinaryStandardApi, ULPStandardApi, ThousandthStandardApi, \ BINARY_COMPARE_UNSUPPORT_LIST, ULP_COMPARE_SUPPORT_LIST, convert_str_to_float, CompareMessage, is_inf_or_nan, \ check_inf_or_nan -from atat.pytorch.api_accuracy_checker.compare.compare_column import ApiPrecisionOutputColumn -from atat.pytorch.api_accuracy_checker.run_ut.run_ut import get_validated_result_csv_path -from atat.core.common.file_check import FileChecker, change_mode, check_path_before_create, create_directory -from atat.pytorch.common.log import logger -from atat.core.common.utils 
import CompareException -from atat.core.common.const import CompareConst, FileCheckConst +from msprobe.pytorch.api_accuracy_checker.compare.compare_column import ApiPrecisionOutputColumn +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import get_validated_result_csv_path +from msprobe.core.common.file_check import FileChecker, change_mode, check_path_before_create, create_directory +from msprobe.pytorch.common.log import logger +from msprobe.core.common.utils import CompareException +from msprobe.core.common.const import CompareConst, FileCheckConst CompareConfig = namedtuple('CompareConfig', ['npu_csv_path', 'gpu_csv_path', 'result_csv_path', 'details_csv_path']) BenchmarkInf_Nan_Consistency = namedtuple('BenchmarkInf_Nan_Consistency', ['small_value_inf_nan_consistency', diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare.py rename to 
debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare.py index ca35c8ed5da..ee49588288e 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare.py @@ -3,18 +3,18 @@ import os from collections import namedtuple import torch import numpy as np -from atat.pytorch.common.log import logger -from atat.pytorch.api_accuracy_checker.common.utils import get_json_contents, write_csv -from atat.pytorch.api_accuracy_checker.compare.compare_utils import check_dtype_comparable, \ +from msprobe.pytorch.common.log import logger +from msprobe.pytorch.api_accuracy_checker.common.utils import get_json_contents, write_csv +from msprobe.pytorch.api_accuracy_checker.compare.compare_utils import check_dtype_comparable, \ DETAIL_TEST_ROWS, precision_configs, BENCHMARK_COMPARE_SUPPORT_LIST, AbsoluteStandardApi, BinaryStandardApi, \ ULPStandardApi, ThousandthStandardApi, apis_threshold -from atat.pytorch.api_accuracy_checker.compare.compare_column import CompareColumn -from atat.pytorch.api_accuracy_checker.compare.algorithm import get_rmse, get_error_balance, get_max_rel_err, \ +from msprobe.pytorch.api_accuracy_checker.compare.compare_column import CompareColumn +from msprobe.pytorch.api_accuracy_checker.compare.algorithm import get_rmse, get_error_balance, get_max_rel_err, \ get_mean_rel_err, get_rel_err, get_abs_err, get_max_abs_err, get_rel_err_ratio, cosine_sim, get_rel_err_origin, \ get_small_value_err_ratio, get_finite_and_infinite_mask, get_small_value_mask, check_inf_nan_value, \ check_small_value, check_norm_value, get_abs_bench_with_eps, get_ulp_err -from atat.pytorch.api_accuracy_checker.common.config import msCheckerConfig -from atat.core.common.const import Const, CompareConst +from msprobe.pytorch.api_accuracy_checker.common.config import msCheckerConfig +from msprobe.core.common.const import Const, CompareConst ResultInfo = 
namedtuple('ResultInfo', ['full_api_name', 'fwd_success_status', 'bwd_success_status', diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_column.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare_column.py similarity index 98% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_column.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare_column.py index 9867a76fadf..fb6d5dcc0f1 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_column.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare_column.py @@ -1,4 +1,4 @@ -from atat.core.common.const import CompareConst +from msprobe.core.common.const import CompareConst class CompareColumn: diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_utils.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_utils.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py index b7b32e41e47..5c7e86ff36c 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py @@ -5,10 +5,10 @@ import math import numpy as np import torch import yaml -from atat.core.common.utils import CompareException -from atat.core.common.const import Const -from atat.pytorch.common.log import logger -from atat.core.common.file_check import FileOpen +from msprobe.core.common.utils import CompareException +from msprobe.core.common.const import Const +from msprobe.pytorch.common.log import logger +from msprobe.core.common.file_check import FileOpen current_time = time.strftime("%Y%m%d%H%M%S") diff --git 
a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/config.yaml similarity index 77% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/config.yaml index 7f26c72aa31..2dac535dc05 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/config.yaml @@ -1,4 +1,5 @@ white_list: [] +black_list: [] error_data_path: './' precision: 14 \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/.keep b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/.keep similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/.keep rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/.keep diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/__init__.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py similarity index 95% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py index 97b2dcc7e40..b2eec691af0 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py @@ -20,10 +20,12 @@ import math import torch import numpy -from 
atat.pytorch.api_accuracy_checker.run_ut.run_ut_utils import hf_32_standard_api -from atat.pytorch.api_accuracy_checker.common.utils import check_file_or_directory_path, check_object_type, get_full_data_path, CompareException -from atat.pytorch.common.log import logger -from atat.core.common.const import Const +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import hf_32_standard_api +from msprobe.pytorch.api_accuracy_checker.common.utils import check_object_type, get_full_data_path, \ + CompareException +from msprobe.core.common.file_check import FileChecker +from msprobe.pytorch.common.log import logger +from msprobe.core.common.const import Const, FileCheckConst TORCH_TYPE = ["torch.device", "torch.dtype"] TENSOR_DATA_LIST = ["torch.Tensor", "torch.nn.parameter.Parameter"] @@ -86,12 +88,13 @@ def gen_real_tensor(data_path, convert_type): convert_type: convert ori_type to dist_type flag. """ data_path = os.path.realpath(data_path) - check_file_or_directory_path(data_path) + data_path_checker = FileChecker(data_path, FileCheckConst.FILE, ability=FileCheckConst.READ_ABLE) + data_path = data_path_checker.common_check() if not data_path.endswith('.pt') and not data_path.endswith('.npy'): error_info = f"The file: {data_path} is not a pt or numpy file." 
raise CompareException(CompareException.INVALID_FILE_ERROR, error_info) if data_path.endswith('.pt'): - data = torch.load(data_path).cpu() + data = torch.load(data_path, map_location=torch.device('cpu')) else: data_np = numpy.load(data_path) data = torch.from_numpy(data_np) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py similarity index 86% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py index d2ab9c1e952..9acb5ee6498 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py @@ -9,14 +9,14 @@ import threading from collections import namedtuple from itertools import cycle from tqdm import tqdm -from atat.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, get_validated_result_csv_path, \ +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, get_validated_result_csv_path, \ get_validated_details_csv_path, preprocess_forward_content -from atat.pytorch.api_accuracy_checker.compare.compare import Comparator -from atat.pytorch.common import parse_json_info_forward_backward -from atat.core.common.file_check import FileChecker, check_file_suffix, check_link, FileOpen, \ +from msprobe.pytorch.api_accuracy_checker.compare.compare import Comparator +from msprobe.pytorch.common import parse_json_info_forward_backward +from msprobe.core.common.file_check import FileChecker, check_file_suffix, check_link, FileOpen, \ check_path_before_create, create_directory -from atat.pytorch.common.log import logger -from atat.core.common.const import FileCheckConst +from msprobe.pytorch.common.log import logger +from msprobe.core.common.const import FileCheckConst def 
split_json_file(input_file, num_splits, filter_api): @@ -68,7 +68,7 @@ signal.signal(signal.SIGTERM, signal_handler) ParallelUTConfig = namedtuple('ParallelUTConfig', ['api_files', 'out_path', 'num_splits', 'save_error_data_flag', 'jit_compile_flag', 'device_id', - 'result_csv_path', 'total_items', 'real_data_path']) + 'result_csv_path', 'total_items', 'config_path']) def run_parallel_ut(config): @@ -90,7 +90,7 @@ def run_parallel_ut(config): *(['-j'] if config.jit_compile_flag else []), *(['-save_error_data'] if config.save_error_data_flag else []), '-csv_path', config.result_csv_path, - *(['-real_data_path', config.real_data_path] if config.real_data_path else []) + *(['-config', config.config_path] if config.config_path else []) ] return cmd @@ -110,14 +110,9 @@ def run_parallel_ut(config): def update_progress_bar(progress_bar, result_csv_path): while any(process.poll() is None for process in processes): - try: - with open(result_csv_path, 'r') as result_file: - completed_items = len(result_file.readlines()) - 1 - progress_bar.update(completed_items - progress_bar.n) - except FileNotFoundError: - logger.warning(f"Result CSV file not found: {result_csv_path}.") - except Exception as e: - logger.error(f"An unexpected error occurred while reading result CSV: {e}") + with FileOpen(result_csv_path, 'r') as result_file: + completed_items = len(result_file.readlines()) - 1 + progress_bar.update(completed_items - progress_bar.n) time.sleep(1) for api_info in config.api_files: @@ -175,7 +170,7 @@ def prepare_config(args): out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE) out_path = out_path_checker.common_check() split_files, total_items = split_json_file(api_info, args.num_splits, args.filter_api) - + config_path = os.path.realpath(args.config_path) if args.config_path else None result_csv_path = args.result_csv_path or os.path.join(out_path, f"accuracy_checking_result_{time.strftime('%Y%m%d%H%M%S')}.csv") if not 
args.result_csv_path: details_csv_path = os.path.join(out_path, f"accuracy_checking_details_{time.strftime('%Y%m%d%H%M%S')}.csv") @@ -187,7 +182,7 @@ def prepare_config(args): logger.info(f"UT task details will be saved in {details_csv_path}") return ParallelUTConfig(split_files, out_path, args.num_splits, args.save_error_data, args.jit_compile, args.device_id, result_csv_path, - total_items, args.real_data_path) + total_items, config_path) def main(): diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py similarity index 85% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py index c5834e9a8c7..732745ee8ca 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py @@ -10,10 +10,13 @@ else: is_gpu = False import torch from tqdm import tqdm -from atat.pytorch.api_accuracy_checker.run_ut.run_ut import exec_api, generate_device_params, get_api_info -from atat.pytorch.api_accuracy_checker.common.utils import get_json_contents -from atat.core.common.file_check import check_link -from atat.pytorch.common.log import logger +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import exec_api, generate_device_params, get_api_info +from msprobe.pytorch.api_accuracy_checker.common.utils import get_json_contents +from msprobe.core.common.file_check import check_link +from msprobe.pytorch.common.log import logger +from msprobe.pytorch.common.parse_json import parse_json_info_forward_backward +from msprobe.core.common.const import Const + def check_tensor_overflow(x): if isinstance(x, torch.Tensor) and x.numel() != 0 and x.dtype != torch.bool: @@ -52,12 +55,12 @@ def 
check_data_overflow(x): def run_overflow_check(forward_file): logger.info("start UT test") - forward_content = get_json_contents(forward_file) + forward_content, _, real_data_path = parse_json_info_forward_backward(forward_file) for api_full_name, api_info_dict in tqdm(forward_content.items()): try: - run_torch_api(api_full_name, api_info_dict) + run_torch_api(api_full_name, api_info_dict, real_data_path) except Exception as err: - api_name = api_full_name.split("_", 1)[1].rsplit("_", 2)[0] + _, api_name, _ = api_full_name.split(Const.SEP) if "not implemented for 'Half'" in str(err): logger.warning(f"API {api_name} not support half tensor in CPU, please add {api_name} to CONVERT_API " f"'fp16_to_fp32' list in accuracy_tools/api_accuracy_check/common/utils.py file.") @@ -68,11 +71,10 @@ def run_overflow_check(forward_file): logger.error(f"Run {api_full_name} UT Error: %s" % str(err)) -def run_torch_api(api_full_name, api_info_dict): +def run_torch_api(api_full_name, api_info_dict, real_data_path): torch.npu.clear_npu_overflow_flag() - api_type = api_full_name.split(".")[0] - api_name = api_full_name.split(".", 1)[1].rsplit(".", 2)[0] - args, kwargs, need_grad = get_api_info(api_info_dict, api_name, real_data_path='') + api_type, api_name, _ = api_full_name.split(Const.SEP) + args, kwargs, need_grad = get_api_info(api_info_dict, api_name, real_data_path) if not need_grad: logger.warning("%s function with out=... arguments don't support automatic differentiation, skip backward." 
% api_full_name) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py similarity index 85% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py index cd83a95801f..559dfdc0f14 100644 --- a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py @@ -18,28 +18,32 @@ else: import torch from tqdm import tqdm -from atat.pytorch.api_accuracy_checker.run_ut.run_ut_utils import Backward_Message, hf_32_standard_api -from atat.pytorch.api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args -from atat.pytorch.api_accuracy_checker.common.utils import get_json_contents, api_info_preprocess, \ +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import Backward_Message, hf_32_standard_api +from msprobe.pytorch.api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args +from msprobe.pytorch.api_accuracy_checker.common.utils import get_json_contents, api_info_preprocess, \ initialize_save_path, UtDataProcessor -from atat.pytorch.api_accuracy_checker.compare.compare import Comparator -from atat.pytorch.api_accuracy_checker.compare.compare_column import CompareColumn -from atat.pytorch.hook_module.wrap_tensor import TensorOPTemplate -from atat.pytorch.hook_module.wrap_functional import FunctionalOPTemplate -from atat.pytorch.hook_module.wrap_torch import TorchOPTemplate -from atat.pytorch.api_accuracy_checker.common.config import msCheckerConfig -from atat.pytorch.common.parse_json import parse_json_info_forward_backward -from atat.core.common.file_check import FileOpen, FileChecker, \ +from msprobe.pytorch.api_accuracy_checker.compare.compare import Comparator +from msprobe.pytorch.api_accuracy_checker.compare.compare_column 
import CompareColumn +from msprobe.pytorch.hook_module.wrap_tensor import TensorOPTemplate +from msprobe.pytorch.hook_module.wrap_functional import FunctionalOPTemplate +from msprobe.pytorch.hook_module.wrap_torch import TorchOPTemplate +from msprobe.pytorch.hook_module.wrap_npu_custom import NpuOPTemplate +from msprobe.pytorch.hook_module.wrap_aten import AtenOPTemplate +from msprobe.pytorch.api_accuracy_checker.common.config import msCheckerConfig +from msprobe.pytorch.common.parse_json import parse_json_info_forward_backward +from msprobe.core.common.file_check import FileOpen, FileChecker, \ change_mode, check_file_suffix, check_link, check_path_before_create, create_directory -from atat.pytorch.common.log import logger -from atat.core.common.const import Const, FileCheckConst, CompareConst +from msprobe.pytorch.common.log import logger +from msprobe.pytorch.pt_config import parse_json_config +from msprobe.core.common.const import Const, FileCheckConst, CompareConst current_time = time.strftime("%Y%m%d%H%M%S") UT_ERROR_DATA_DIR = 'ut_error_data' + current_time RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv" DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv" RunUTConfig = namedtuple('RunUTConfig', ['forward_content', 'backward_content', 'result_csv_path', 'details_csv_path', - 'save_error_data', 'is_continue_run_ut', 'real_data_path']) + 'save_error_data', 'is_continue_run_ut', 'real_data_path', 'white_list', + 'black_list', 'error_data_path']) not_backward_list = ['repeat_interleave'] not_detach_set = {'resize_', 'resize_as_', 'set_', 'transpose_', 't_', 'squeeze_', 'unsqueeze_'} not_raise_dtype_set = {'type_as'} @@ -76,6 +80,12 @@ def exec_api(api_type, api_name, args, kwargs): if api_type == "Torch": torch_api = TorchOPTemplate(api_name, str, False) out = torch_api.forward(*args, **kwargs) + if api_type == "Aten": + torch_api = AtenOPTemplate(api_name, None, False) + out = torch_api.forward(*args, **kwargs) + if 
api_type == "NPU": + torch_api = NpuOPTemplate(api_name, None, False) + out = torch_api.forward(*args, **kwargs) return out @@ -176,8 +186,7 @@ def run_ut(config): logger.info(f"UT task result will be saved in {config.result_csv_path}") logger.info(f"UT task details will be saved in {config.details_csv_path}") if config.save_error_data: - error_data_path = os.path.abspath(os.path.join(msCheckerConfig.error_data_path, UT_ERROR_DATA_DIR)) - logger.info(f"UT task error_datas will be saved in {error_data_path}") + logger.info(f"UT task error_datas will be saved in {config.error_data_path}") compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut) with FileOpen(config.result_csv_path, 'r') as file: csv_reader = csv.reader(file) @@ -188,17 +197,17 @@ def run_ut(config): continue if is_unsupported_api(api_full_name): # TODO run_ut does not support to the npu fusion api and distributed api continue + [_, api_name, _] = api_full_name.split(Const.SEP) try: - if msCheckerConfig.white_list: - [_, api_name, _] = api_full_name.split(Const.SEP) - if api_name not in set(msCheckerConfig.white_list): - continue + if config.black_list and api_name in config.black_list: + continue + if config.white_list and api_name not in config.white_list: + continue data_info = run_torch_api(api_full_name, config.real_data_path, config.backward_content, api_info_dict) is_fwd_success, is_bwd_success = compare.compare_output(api_full_name, data_info) if config.save_error_data: - do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) + do_save_error_data(api_full_name, data_info, config.error_data_path, is_fwd_success, is_bwd_success) except Exception as err: - [_, api_name, _] = api_full_name.split(Const.SEP) if "expected scalar type Long" in str(err): logger.warning(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API " f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.") 
@@ -227,16 +236,16 @@ def is_unsupported_api(api_name): return flag -def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success): +def do_save_error_data(api_full_name, data_info, error_data_path, is_fwd_success, is_bwd_success): if not is_fwd_success or not is_bwd_success: - processor = UtDataProcessor(os.path.join(msCheckerConfig.error_data_path, UT_ERROR_DATA_DIR)) + processor = UtDataProcessor(error_data_path) for element in data_info.in_fwd_data_list: processor.save_tensors_in_element(api_full_name + '.forward.input', element) - processor.save_tensors_in_element(api_full_name + '.forward.output.bench', data_info.bench_out) - processor.save_tensors_in_element(api_full_name + '.forward.output.device', data_info.device_out) + processor.save_tensors_in_element(api_full_name + '.forward.output.bench', data_info.bench_output) + processor.save_tensors_in_element(api_full_name + '.forward.output.device', data_info.device_output) processor.save_tensors_in_element(api_full_name + '.backward.input', data_info.grad_in) - processor.save_tensors_in_element(api_full_name + '.backward.output.bench', data_info.bench_grad_out) - processor.save_tensors_in_element(api_full_name + '.backward.output.device', data_info.device_grad_out) + processor.save_tensors_in_element(api_full_name + '.backward.output.bench', data_info.bench_grad) + processor.save_tensors_in_element(api_full_name + '.backward.output.device', data_info.device_grad) def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict): @@ -273,7 +282,7 @@ def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict if need_backward: if need_to_backward(grad_index, out): - backward_args = backward_content[api_full_name].get("grad_output") + backward_args = backward_content[api_full_name].get("input") grad = gen_args(backward_args, api_name, real_data_path=real_data_path)[0] bench_grad, _ = generate_cpu_params(grad, {}, False, api_name) bench_grad_out = 
run_backward(cpu_args, bench_grad, grad_index, out) @@ -314,14 +323,14 @@ def run_backward(args, grad, grad_index, out): return grad_out -def initialize_save_error_data(): - error_data_path = msCheckerConfig.error_data_path +def initialize_save_error_data(error_data_path): check_path_before_create(error_data_path) create_directory(error_data_path) - error_data_path_checker = FileChecker(msCheckerConfig.error_data_path, FileCheckConst.DIR, + error_data_path_checker = FileChecker(error_data_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE) error_data_path = error_data_path_checker.common_check() - initialize_save_path(error_data_path, UT_ERROR_DATA_DIR) + error_data_path =initialize_save_path(error_data_path, UT_ERROR_DATA_DIR) + return error_data_path def get_validated_result_csv_path(result_csv_path, mode): @@ -378,12 +387,10 @@ def _run_ut_parser(parser): help=" The path of accuracy_checking_result_{timestamp}.csv, " "when run ut is interrupted, enter the file path to continue run ut.", required=False) - parser.add_argument("-real_data_path", dest="real_data_path", nargs="?", const="", default="", type=str, - help=" In real data mode, the root directory for storing real data " - "must be configured.", - required=False) parser.add_argument("-f", "--filter_api", dest="filter_api", action="store_true", help=" Whether to filter the api in the api_info_file.", required=False) + parser.add_argument("-config", "--config_path", dest="config_path", default="", type=str, + help=" The path of config.json", required=False) def preprocess_forward_content(forward_content): @@ -397,9 +404,9 @@ def preprocess_forward_content(forward_content): if key not in arg_cache: filtered_new_args = [ {k: v for k, v in arg.items() if k not in ['Max', 'Min']} - for arg in value['args'] if isinstance(arg, dict) + for arg in value['input_args'] if isinstance(arg, dict) ] - arg_cache[key] = (filtered_new_args, value['kwargs']) + arg_cache[key] = (filtered_new_args, 
value['input_kwargs']) filtered_new_args, new_kwargs = arg_cache[key] @@ -464,14 +471,22 @@ def run_ut_command(args): if args.result_csv_path: result_csv_path = get_validated_result_csv_path(args.result_csv_path, 'result') details_csv_path = get_validated_details_csv_path(result_csv_path) + white_list = msCheckerConfig.white_list + black_list = msCheckerConfig.black_list + error_data_path = msCheckerConfig.error_data_path + if args.config_path: + _, task_config = parse_json_config(args.config_path, Const.RUN_UT) + white_list = task_config.white_list + black_list = task_config.black_list + error_data_path = task_config.error_data_path if save_error_data: if args.result_csv_path: time_info = result_csv_path.split('.')[0].split('_')[-1] global UT_ERROR_DATA_DIR UT_ERROR_DATA_DIR = 'ut_error_data' + time_info - initialize_save_error_data() + error_data_path = initialize_save_error_data(error_data_path) run_ut_config = RunUTConfig(forward_content, backward_content, result_csv_path, details_csv_path, save_error_data, - args.result_csv_path, real_data_path) + args.result_csv_path, real_data_path, set(white_list), set(black_list), error_data_path) run_ut(run_ut_config) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json similarity index 100% rename from debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json rename to debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json diff --git 
a/debug/accuracy_tools/msprobe/pytorch/bench_functions/__init__.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/__init__.py new file mode 100644 index 00000000000..eb06867371c --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/__init__.py @@ -0,0 +1,15 @@ +import os +from pkgutil import iter_modules +from importlib import import_module + +""" +gpu and cpu not implement benchmark function, supplementary benchmarking function implementation +""" + +package_path = os.path.dirname(os.path.realpath(__file__)) +for _, module_name, _ in iter_modules([package_path]): + module = import_module(f"{__name__}.{module_name}") + for attr_name in dir(module): + attr = getattr(module, attr_name) + if callable(attr) and "npu_custom" not in attr_name: + globals()[attr_name] = attr diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/apply_adam_w.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/apply_adam_w.py new file mode 100644 index 00000000000..caf21a604c6 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/apply_adam_w.py @@ -0,0 +1,28 @@ +import torch + + +def npu_apply_adam_w(beta1_power, beta2_power, lr, weight_decay, + beta1, beta2, eps, grad, max_grad_norm, amsgrad, maximize, out): + var, m, v = out + if amsgrad: + max_grad_norm = (torch.rand(var.shape) * 10.0 - 5.0).to(var.dtype) + beta1_power_out = beta1_power * beta1 + beta2_power_out = beta2_power * beta2 + var_t = var * (1 + (-lr * weight_decay)) + gt = -grad if maximize else grad + m_out = m * beta1 - (beta1 + (-1)) * gt + v_out = v * beta2 - (beta2 + (-1)) * gt * gt + + if amsgrad: + max_grad_norm_out = torch.max(max_grad_norm, v_out) + if (1 - beta2_power_out) == 0: + beta2_power_out -= eps + denom = torch.sqrt(torch.div(max_grad_norm_out, (1 - beta2_power_out))) + eps + else: + vraintain = torch.div(v_out, (1 - beta2_power_out)) + denom = torch.sqrt(vraintain) + eps + + if (1 - beta1_power_out) == 0: + beta1_power_out -= eps + var_out = 
var_t + torch.div(-lr * m_out, (1 - beta1_power_out)).div(denom) + return var_out.cpu(), m_out.cpu(), v_out.cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/confusion_transpose.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/confusion_transpose.py new file mode 100644 index 00000000000..627bf11b64f --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/confusion_transpose.py @@ -0,0 +1,19 @@ +def npu_confusion_transpose(data, perm, shape, transpose_first): + if transpose_first: + output = data.permute(*perm).contiguous().view(shape) + else: + output = data.view(shape).permute(*perm) + return output.cpu() + + +def npu_confusion_transpose_backward(grad, perm, shape, transpose_first): + shape_cal = shape if transpose_first else [shape[perm_dim] for perm_dim in perm] + perm_cal = [0] * len(perm) + for i, perm_dim in enumerate(perm): + perm_cal[perm_dim] = i + + if transpose_first: + result = grad.permute(*perm_cal).reshape(shape_cal) + else: + result = grad.reshape(shape_cal).permute(*perm_cal) + return result.cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/fast_gelu.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/fast_gelu.py new file mode 100644 index 00000000000..a1a9ca08085 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/fast_gelu.py @@ -0,0 +1,55 @@ +import torch + + +def fast_gelu(input0): + attr = 1.702 + const_0 = 0 - attr + const_1 = 1 + const_2 = attr / 2 + + abs_x = torch.abs(input0) + mul_abs_x = abs_x * const_0 + exp_abs_x = torch.exp(mul_abs_x) + div_down = exp_abs_x + const_1 + + pn_x = input0 - abs_x + mul_pn_x = pn_x * const_2 + exp_pn_x = torch.exp(mul_pn_x) + div_up = input0 * exp_pn_x + div_down_rec = torch.reciprocal(div_down) + result = div_up * div_down_rec + + return result.cpu() + + +def npu_fast_gelu_backward(grad, input_x): + const_2 = 1.702 + const_3 = 1.0 + const_1 = 0.0 - const_2 + + # e^(-1.702x) + abs_x = torch.abs(input_x) + 
mul_abs_x = abs_x * const_1 + exp_x = torch.exp(mul_abs_x) + + # 1.702xe^(-1.702x) + add_2 = input_x * exp_x + add_2 = add_2 * const_2 + + # e^(1.702(x-|x|)) + pn_x = input_x - abs_x + mul_pn_x = pn_x * const_2 + exp_pn_x = torch.exp(mul_pn_x) + + # e^(-1.702x) + 1.702xe^(-1.702x) + e^(1.702(x-|x|)) + div_up = exp_x + add_2 + div_up = div_up + exp_pn_x + + # (e^(-1.702x)+1)^2 + div_down_i = exp_x + const_3 + div_down = div_down_i * div_down_i + div_down_rec = torch.reciprocal(div_down) + result_temp = div_up * div_down_rec + result = grad * result_temp + + return result.cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/layer_norm_eval.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/layer_norm_eval.py new file mode 100644 index 00000000000..f6949c079e2 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/layer_norm_eval.py @@ -0,0 +1,6 @@ +import torch + + +def npu_layer_norm_eval(data, normalized_shape): + result = torch.nn.functional.layer_norm(data, normalized_shape) + return result.cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/linear.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/linear.py new file mode 100644 index 00000000000..95db875edf6 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/linear.py @@ -0,0 +1,12 @@ +import torch + + +def npu_linear(x, weight, bias): + output = torch.nn.functional.linear(x, weight, bias) + return output.cpu() + + +def npu_linear_backward(grad, input_data, weight): + input_grad = torch.matmul(grad, weight) + weight_grad = torch.matmul(grad.t(), input_data) + return input_grad.cpu(), weight_grad.cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/matmul_backward.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/matmul_backward.py new file mode 100644 index 00000000000..ed1c746ec16 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/matmul_backward.py @@ -0,0 +1,48 @@ 
+import torch + + +def matmul_backward(grad, self, other, mask): + grad_self, grad_other = None, None + dim_self = self.dim() + dim_other = other.dim() + + size_grad = list(grad.size()) + size_self = list(self.size()) + size_other = list(other.size()) + if dim_self == 1 and dim_other == 1: + grad_self = other.mul(grad) if mask[0] else grad_self + grad_other = self.mul(grad) if mask[1] else grad_other + elif dim_self == 2 and dim_other == 1: + grad_self = grad.unsqueeze(1).mm(other.unsqueeze(0)) if mask[0] else grad_self + grad_other = self.transpose(-1, -2).mm(grad.unsqueeze(1)).squeeze_(1) if mask[1] else grad_other + elif dim_self == 1 and dim_other == 2: + grad_self = grad.unsqueeze(0).mm(other.transpose(-1, -2)).squeeze_(0) if mask[0] else grad_self + grad_other = self.unsqueeze(1).mm(grad.unsqueeze(0)) if mask[1] else grad_other + elif dim_self >= 3 and (dim_other == 1 or dim_other == 2): + view_size = 1 if dim_other == 1 else size_grad[-1] + unfolded_grad = (grad.unsqueeze(-1) if dim_other == 1 else grad).contiguous().view(-1, view_size) + if mask[0]: + grad_self = unfolded_grad.mm(other.unsqueeze(0) if dim_other == 1 else other.transpose(-1, -2)) \ + .view(size_self) + if mask[1]: + unfolded_self = self.contiguous().view([-1, size_self[-1]]) + grad_other = unfolded_self.transpose(-1, -2).mm(unfolded_grad).view(size_other) + elif (dim_self == 1 or dim_self == 2) and dim_other >= 3: + view_size = 1 if dim_self == 1 else size_grad[-2] + unfolded_grad_T = grad.view([-1, view_size]) \ + if dim_self == 1 else grad.transpose(-1, -2).contiguous().view([-1, view_size]) + if mask[0]: + # create a 2D-matrix from other + unfolded_other_T = \ + other.transpose(-1, -2).contiguous().view([-1, size_other[-2]]).transpose(-1, -2) + grad_self = unfolded_other_T.mm(unfolded_grad_T).transpose(-1, -2).view(size_self) + if mask[1]: + size_other_T = size_other[:-2] + size_other_T.extend(size_other[::-1][:2]) + grad_other = \ + unfolded_grad_T.mm(self.unsqueeze(0) if dim_self == 1 
else self).view(size_other_T).transpose(-1, -2) + else: + grad_self = torch.matmul(grad, other.transpose(-1, -2)) if mask[0] else grad_self + grad_other = torch.matmul(self.transpose(-1, -2), grad) if mask[1] else grad_other + + return grad_self.cpu(), grad_other.cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/npu_fusion_attention.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/npu_fusion_attention.py new file mode 100644 index 00000000000..63f1fa2a3b6 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/npu_fusion_attention.py @@ -0,0 +1,421 @@ +import torch +import numpy as np +from einops import rearrange + +from msprobe.pytorch.common.utils import logger + +gtype = torch.float64 # arm host必须选择float64,x86环境选择float32即可,64也行。arm计算很慢,s=8k的场景建议使用x86 +softmax_build_mode = "QKV" # "MAX_SUM" + +""" +# 前向函数声明对比 +标杆实现:fusion_attention_forward: q, k, v, drop_mask, atten_mask, pse, scale, keep_prob +融合算子:npu_fusion_attention_forward: query, key, value, head_num, input_layout, *, pse=None, padding_mask=None, + atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, + next_tockens=2147483647, inner_precise=0, prefix=None, sparse_mode=0, + gen_mask_parallel=True, sync=False + +# 反向函数声明对比 +标杆实现:fusion_attention_backward: dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob +融合算子:npu_fusion_attention_backward: query, key, value, dy, head_num, input_layout, *, pse=None, padding_mask=None, + atten_mask=None, softmax_max=None, softmax_sum=None, softmax_in=None, + attention_in=None, scale_value=1.0, keep_prob=1.0, pre_tockens=2147483647, + next_tockens=2147483647, inner_precise=0, seed=0, offset=0, + numels=0, prefix=None, sparse_mode=0, gen_mask_parallel=True, sync=False +""" + + +def softmax_forward(x): + x_max = torch.max(x, dim=-1, keepdims=True)[0] + x_sub = x.sub(x_max) + y = torch.exp(x_sub) + x_sum = y.sum(dim=-1, keepdims=True) + res = y.div(x_sum) + return res, x_max, x_sum + + +def softmax_grad(dp, 
softmax_res): + muls = dp * softmax_res + muls_r = muls.sum(dim=-1, keepdims=True) + sub_r = dp - muls_r + res = sub_r * softmax_res + return res + + +def broadcast_kv(num_heads, num_kv_heads, kv_tensor, dtype): + if num_kv_heads == 0 or num_kv_heads < num_heads: + raise ValueError(f"num_kv_heads must be non-zero and less than num_heads.") + + factor = num_heads // num_kv_heads + kv_shape = kv_tensor.shape + B = kv_shape[0] + S = kv_shape[2] + D = kv_shape[3] + kv_res = torch.zeros([B, num_heads, S, D]).to(dtype) + for i in range(num_heads): + j = i // factor + kv_res[:, i:i + 1, :, :] = kv_tensor[:, j:j + 1, :, :] + return kv_res + + +def calculate_qk(q, k, atten_mask, pse, scale): + if pse is None or len(pse.shape) == 0: + qk = torch.matmul(q, k.permute(0, 1, 3, 2)).mul(scale) + else: + qk = (torch.matmul(q, k.permute(0, 1, 3, 2)) + pse).mul(scale) + if atten_mask is None or len(atten_mask.shape) == 0: + return qk + else: + qk = qk + atten_mask.bool() * (-40000.0) # -10000 + return qk + + +def fusion_attention_forward(q, k, v, drop_mask, atten_mask, pse, scale, keep_prob): + qk = calculate_qk(q, k, atten_mask, pse, scale) + softmax_res, softmax_max, softmax_sum = softmax_forward(qk) + if drop_mask is None or len(drop_mask.shape) == 0: + drop_res = softmax_res + else: + drop_res = softmax_res * drop_mask * (1.0 / keep_prob) + y = torch.matmul(drop_res, v) + return y, softmax_max, softmax_sum + + +def fusion_attention_backward(dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob): + dp = torch.matmul(dx, v.permute(0, 1, 3, 2)) + if drop_mask is None or len(drop_mask.shape) == 0: + drop_res = softmax_res.permute(0, 1, 3, 2) + dp_drop = dp + else: + drop_res = softmax_res.mul(drop_mask).mul(1.0 / keep_prob).permute(0, 1, 3, 2) + dp_drop = dp * drop_mask * (1.0 / keep_prob) + dv = torch.matmul(drop_res, dx) + softmax_grad_res = (softmax_grad(dp_drop, softmax_res) * scale) + dq = torch.matmul(softmax_grad_res, k) + dk = torch.matmul(softmax_grad_res.permute(0, 1, 
3, 2), q) + return dq, dk, dv + + +def parse_bsnd_args(query, key, head_num, input_layout): + supported_input_layout = ["BSH", "SBH", "BSND", "BNSD", "TND"] + B, S1, S2, N1, N2, D, H1, H2 = None, None, None, head_num, None, None, None, None + + if not isinstance(input_layout, str) or input_layout not in supported_input_layout: + raise ValueError(f"Invalid input_layout arg which must be one of {supported_input_layout}.") + + if input_layout == "TND": + raise ValueError(f"input_layout {input_layout} does not supported for now.") + try: + if input_layout == "BSH": + B, S1, H1 = query.shape + _, S2, H2 = key.shape + D = H1 // N1 + N2 = H2 // D + elif input_layout == "SBH": + S1, B, H1 = query.shape + S2, _, H2 = key.shape + D = H1 // N1 + N2 = H2 // D + elif input_layout == "BSND": + B, S1, N1, D = query.shape + _, S2, N2, _ = key.shape + H1 = N1 * D + H2 = N2 * D + elif input_layout == "BNSD": + B, N1, S1, D = query.shape + _, N2, S2, _ = key.shape + H1 = N1 * D + H2 = N2 * D + except Exception as e: + raise ValueError(f"query.shape: {query.shape}, key.shape: {key.shape}, parse_bsnd_args error: {e}") from e + + if D == 0: + raise ValueError(f"Value D must be non-zero.") + DTYPE = query.dtype + return B, S1, S2, N1, N2, D, H1, H2, DTYPE + + +def convert_from_bnsd(_input, input_layout): + if input_layout == "BSH": + # (B,N,S,D)=>(B,S,N*D) + out = rearrange(_input, 'b n s d -> b s (n d)').contiguous() + elif input_layout == "SBH": + # (B,N,S,D)=>(S,B,N*D) + out = rearrange(_input, 'b n s d -> s b (n d)').contiguous() + elif input_layout == "BSND": + # (B,N,S,D)=>(B,S,N,D) + out = rearrange(_input, 'b n s d -> b s n d').contiguous() + elif input_layout == "TND": + raise ValueError(f"input_layout {input_layout} does not supported for now.") + else: + out = _input + return out + + +def convert_to_bnsd(_input, n, input_layout): + # 默认"BNSD"无需处理 + if input_layout == "BSH": + # (B,S,N*D)=>(B,N,S,D) + out = rearrange(_input, 'b s (n d) -> b n s d', n=n) + elif input_layout == 
"SBH": + # (S,B,N*D)=>(B,N,S,D) + out = rearrange(_input, 's b (n d) -> b n s d', n=n) + elif input_layout == "BSND": + # (B,S,N,D)=>(B,N,S,D) + out = rearrange(_input, 'b s n d -> b n s d', n=n) + elif input_layout == "TND": + raise ValueError(f"input_layout {input_layout} does not supported for now.") + else: + out = _input + if out.dim() != 4: + raise ValueError(f"convert qkv format failed with input_layout {input_layout}.") + return out.to(gtype) + + +def generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tocken, next_tocken, dtype): + """ + # 当sparse_mode=2、3、4时小算子到融合算子会走这个优化,反过来看就要拆解回原来的基本实现 + ===> atten_mask = torch.from_numpy(np.triu(np.ones([2048, 2048]), k=1)).to(dtype) + """ + shape = [S1, S2] + + if atten_mask is not None: + # 当FA的输入已经包含atten_mask时,可以认为已经是转换之后的mask矩阵了,有三种特殊场景,即稀疏矩阵场景,需要进行逆向还原 + if sparse_mode == 2 or sparse_mode == 3 or sparse_mode == 4: + logger.info(f"S1: {S1}, S2:{S2}, atten_mask.shape:{atten_mask.shape}, atten_mask.dtype:{atten_mask.dtype}") + + if atten_mask.dim() == 2 and atten_mask.shape[0] == 2048 and atten_mask.shape[1] == 2048: + if atten_mask.equal(torch.from_numpy(np.triu(np.ones([2048, 2048]), k=1)).to(atten_mask.dtype)): + if sparse_mode == 2: + atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=1)) + elif sparse_mode == 3: + atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=S2 - S1 + 1)) + elif sparse_mode == 4: + atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1)) + atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1)) + atten_mask = atten_mask_u + atten_mask_l + logger.debug(f"反向转换atten_mask {atten_mask.shape}") + return atten_mask.to(dtype) + + return atten_mask.to(dtype) + + if atten_mask is not None: + if atten_mask.dim() == 2: + if atten_mask.shape[0] != S1 or atten_mask.shape[1] != S2: + raise ValueError(f"Invalid atten_mask shape `SS` {atten_mask.shape}") + shape = [S1, S2] + elif atten_mask.dim() == 4: + if atten_mask.shape[1] == 1: + shape = 
[B, 1, S1, S2] if B != 1 else [1, 1, S1, S2] + else: + shape = [B, N1, S1, S2] if B != 1 else [1, N1, S1, S2] + + if sparse_mode == 0: + atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1)) + atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1)) + atten_mask = atten_mask_u + atten_mask_l + elif sparse_mode == 1: # no sparse + atten_mask = torch.from_numpy(np.zeros(shape)) + elif sparse_mode == 2: + atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=1)) + elif sparse_mode == 3: + atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=S2 - S1 + 1)) + elif sparse_mode == 4: + atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1)) + atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1)) + atten_mask = atten_mask_u + atten_mask_l + # 注:不会出现sparse_mode=5的情况,该情况要求必须要传入atten_mask,且atten_mask矩阵数据格式须为BNSS或B1SS, + # 因此可以认为FA的输入已经是正确的atten_mask了 + return atten_mask.to(dtype) + + +def generate_kv(key, value, N1, N2): + # N不等长适配by cdy + if not (N1 == N2): + k_new = broadcast_kv(N1, N2, key, key.dtype) + v_new = broadcast_kv(N1, N2, value, value.dtype) + else: + k_new = key + v_new = value + return k_new, v_new + + +def rebuid_softmax_by_qkv(q, k, atten_mask, pse, scale): + """ + attention = softmax(QK^T/sqrt(d))V + softmax(x_i) = e^(x_i - x_max) / sum(e^(x_i - x_max)) + """ + logger.info("Using QKV to rebuild original softmax") + qk = calculate_qk(q, k, atten_mask, pse, scale) + softmax_res, x_max, x_sum = softmax_forward(qk) + return softmax_res + + +def rebuild_softmax_by_max_sum(q, k, atten_mask, pse, scale, softmax_max, softmax_sum): + """ + attention = softmax(QK^T/sqrt(d))V + softmax(x_i) = e^(x_i - x_max_i) / x_sum_i) + """ + logger.info("Using softmax_max and softmax_sum to rebuild original softmax") + qk = calculate_qk(q, k, atten_mask, pse, scale) + if softmax_max.shape[-1] == 0: + raise ValueError(f"softmax_max.shape[-1] must be non-zero, softmax_max.shape: 
{softmax_max.shape}") + repeat_dim = qk.shape[-1] // softmax_max.shape[-1] + softmax_res = torch.exp(qk.sub(softmax_max.repeat(1, 1, 1, repeat_dim))).div( + softmax_sum.repeat(1, 1, 1, repeat_dim)) + return softmax_res + + +def npu_fusion_attention_forward_patch(*args, **kwargs): + # query, key, value, head_num, input_layout + if len(args) != 5: + raise ValueError(f"Unsupported npu_fusion_attention args {args}.") + + B, S1, S2, N1, N2, D, H1, H2, DTYPE = parse_bsnd_args(args[0], args[1], args[3], args[4]) + if N1 == N2 and S1 == S2: + logger.debug(f"running case : BNSD = {B}_{N1}_{S1}_{D}, sparse = {kwargs.get('sparse_mode', 0)}") + else: + logger.debug(f"running case: BNSD = {B}_{N1}({N2})_{S1}({S2})_{D}, sparse = {kwargs.get('sparse_mode', 0)}") + if not (N1 % N2 == 0 and N1 >= N2): + raise ValueError(f"N1与N2不匹配,请检查: N1 = {N1}, N2 = {N2}.") + + dims_kwargs = {"B": B, "S1": S1, "S2": S2, "N1": N1, "N2": N2, + "D": D, "H1": H1, "H2": H2, "DTYPE": DTYPE} + + new_kwargs = {"keep_prob": 1, + "scale": kwargs.get("scale", 1 / (D ** 0.5)), + "sparse_mode": kwargs.get("sparse_mode", 0), + "prefix": kwargs.get("prefix"), + "pre_tockens": kwargs.get("pre_tockens", 2147483647), + "next_tockens": kwargs.get("next_tockens", 2147483647), + "pse": kwargs.get("pse"), + "padding_mask": kwargs.get("padding_mask"), + "atten_mask": kwargs.get("atten_mask")} + + return args, dims_kwargs, new_kwargs + + +def npu_fusion_attention_backward_patch(*args, **kwargs): + if len(args) != 6: + raise ValueError(f"Unsupported npu_fusion_attention_grad args {args}.") + + B, S1, S2, N1, N2, D, H1, H2, DTYPE = parse_bsnd_args(args[0], args[1], args[4], args[5]) + if N1 == N2 and S1 == S2: + logger.info(f"running case : BNSD = {B}_{N1}_{S1}_{D}, sparse = {kwargs.get('sparse_mode', 0)}") + else: + logger.info(f"running case: BNSD = {B}_{N1}({N2})_{S1}({S2})_{D}, sparse = {kwargs.get('sparse_mode', 0)}") + if not (N1 % N2 == 0 and N1 >= N2): + raise ValueError(f"N1与N2不匹配,请检查: N1 = {N1}, N2 = {N2}.") + + 
dims_kwargs = {"B": B, "S1": S1, "S2": S2, "N1": N1, "N2": N2, + "D": D, "H1": H1, "H2": H2, "DTYPE": DTYPE} + + new_kwargs = {"keep_prob": 1, + "scale_value": kwargs.get("scale_value", 1 / (D ** 0.5)), + "sparse_mode": kwargs.get("sparse_mode", 0), + "prefix": kwargs.get("prefix"), + "pre_tockens": kwargs.get("pre_tockens", 2147483647), + "next_tockens": kwargs.get("next_tockens", 2147483647), + "pse": kwargs.get("pse"), + "padding_mask": kwargs.get("padding_mask"), + "softmax_max": kwargs.get("softmax_max"), + "softmax_sum": kwargs.get("softmax_sum"), + "softmax_in": kwargs.get("softmax_in"), + "attention_in": kwargs.get("attention_in"), + "seed": kwargs.get("seed", 0), + "offset": kwargs.get("offset", 0), + "numels": kwargs.get("numels", 0), + "atten_mask": kwargs.get("atten_mask")} + + return args, dims_kwargs, new_kwargs + + +def npu_fusion_attention(*args, **kwargs): + new_args, dims_kwargs, new_kwargs = npu_fusion_attention_forward_patch(*args, **kwargs) + query, key, value, input_layout = new_args[0], new_args[1], new_args[2], new_args[4] + N1 = dims_kwargs.get("N1") + N2 = dims_kwargs.get("N2") + S1 = dims_kwargs.get("S1") + S2 = dims_kwargs.get("S2") + B = dims_kwargs.get("B") + DTYPE = dims_kwargs.get("DTYPE") + atten_mask = new_kwargs.get("atten_mask") + keep_prob = new_kwargs.get("keep_prob") + sparse_mode = new_kwargs.get("sparse_mode") + pre_tockens = new_kwargs.get("pre_tockens") + next_tockens = new_kwargs.get("next_tockens") + pse = new_kwargs.get("pse") + scale = new_kwargs.get("scale") + + atten_mask = generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tockens, next_tockens, DTYPE) + query = convert_to_bnsd(query, N1, input_layout) + key = convert_to_bnsd(key, N2, input_layout) + value = convert_to_bnsd(value, N2, input_layout) + k_new, v_new = generate_kv(key, value, N1, N2) + out_golden, softmax_max, softmax_sum = fusion_attention_forward(q=query, k=k_new, v=v_new, + drop_mask=None, atten_mask=atten_mask, + pse=pse, scale=scale, + 
keep_prob=keep_prob) + if out_golden.dim() == 5: + out_golden = out_golden.reshape(out_golden.size(0), out_golden.size(1) * out_golden.size(2), out_golden.size(3), + out_golden.size(4)) + out_golden = convert_from_bnsd(out_golden, input_layout) + + return out_golden.cpu(), softmax_max.repeat(1, 1, 1, 8).cpu(), softmax_sum.repeat(1, 1, 1, 8).cpu() + + +def npu_fusion_attention_grad(*args, **kwargs): + # dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob + new_args, dims_kwargs, new_kwargs = npu_fusion_attention_backward_patch(*args, **kwargs) + query, key, value, dx, input_layout = new_args[0], new_args[1], new_args[2], new_args[3], new_args[5] + N1 = dims_kwargs.get("N1") + N2 = dims_kwargs.get("N2") + S1 = dims_kwargs.get("S1") + S2 = dims_kwargs.get("S2") + B = dims_kwargs.get("B") + D = dims_kwargs.get("D") + DTYPE = dims_kwargs.get("DTYPE") + atten_mask = new_kwargs.get("atten_mask") + keep_prob = new_kwargs.get("keep_prob") + sparse_mode = new_kwargs.get("sparse_mode") + pre_tockens = new_kwargs.get("pre_tockens") + next_tockens = new_kwargs.get("next_tockens") + pse = new_kwargs.get("pse") + softmax_max = new_kwargs.get("softmax_max") + softmax_sum = new_kwargs.get("softmax_sum") + scale_value = new_kwargs.get("scale_value") + + atten_mask = generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tockens, next_tockens, DTYPE) + query = convert_to_bnsd(query, N1, input_layout) + dx = convert_to_bnsd(dx, N1, input_layout) + key = convert_to_bnsd(key, N2, input_layout) + value = convert_to_bnsd(value, N2, input_layout) + k_new, v_new = generate_kv(key, value, N1, N2) + + if softmax_build_mode == "QKV": + softmax_res = rebuid_softmax_by_qkv(query, k_new, atten_mask, pse, scale_value) + else: + softmax_res = rebuild_softmax_by_max_sum(query, k_new, atten_mask, pse, scale_value, softmax_max, softmax_sum) + + dq, dk, dv = fusion_attention_backward(dx, query, k_new, v_new, softmax_res, None, pse, scale_value, keep_prob) + + # N不等长适配by cdy + if not (N1 
== N2): + if N2 == 0: + raise ValueError("dims_kwargs.N2 must be non-zero.") + G = int(N1 / N2) + dk = torch.sum(dk.reshape(B, N2, G, S2, D), dim=2, keepdim=True).reshape(B, N2, S2, D) + dv = torch.sum(dv.reshape(B, N2, G, S2, D), dim=2, keepdim=True).reshape(B, N2, S2, D) + + if dq.dim() == 5: + dq = dq.reshape(dq.size(0), dq.size(1) * dq.size(2), dq.size(3), dq.size(4)) + if dk.dim() == 5: + dk = dk.reshape(dk.size(0), dk.size(1) * dk.size(2), dk.size(3), dk.size(4)) + if dv.dim() == 5: + dv = dv.reshape(dv.size(0), dv.size(1) * dv.size(2), dv.size(3), dv.size(4)) + + dq = convert_from_bnsd(dq, input_layout) + dk = convert_from_bnsd(dk, input_layout) + dv = convert_from_bnsd(dv, input_layout) + + return dq.cpu(), dk.cpu(), dv.cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/rms_norm.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/rms_norm.py new file mode 100644 index 00000000000..e647312fdb2 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/rms_norm.py @@ -0,0 +1,15 @@ +import torch + + +def npu_rms_norm(x, gamma, epsilon=1e-5): + rstd = torch.rsqrt(torch.mean(torch.pow(x, 2), axis=-1, keepdim=True) + epsilon) + res = x * rstd * gamma + return res.cpu(), rstd.float().cpu() + + +def npu_rms_norm_backward(grad, x, gamma, rstd): + mean_gy = (grad * x * gamma * rstd).mean(dim=-1, keepdim=True) + grad_x = (grad * gamma - x * rstd * mean_gy) * rstd + grad_gamma = x * grad * rstd + return grad_x.cpu(), grad_gamma.cpu() + diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/rotary_mul.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/rotary_mul.py new file mode 100644 index 00000000000..0e0fda5f73f --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/rotary_mul.py @@ -0,0 +1,52 @@ +import torch + + +def npu_rotary_mul(x, r1, r2): + x1, x2 = torch.chunk(x, 2, -1) + x_new = torch.cat((-x2, x1), dim=-1) + output = r1 * x + r2 * x_new + return output.cpu() + + +def 
npu_rotary_mul_backward(dy_tensor, x, r1, r2): + x.requires_grad = True + r1.requires_grad = True + r2.requires_grad = True + # golden + x1, x2 = torch.chunk(x, 2, -1) + x_new = torch.cat((-x2, x1), dim=-1) + golden_tensor = r1 * x + r2 * x_new + golden_tensor.backward(dy_tensor) + r1_shape = r1.shape + r1_grad = torch.zeros(r1_shape).type(torch.float32) + r2_grad = torch.zeros(r1_shape).type(torch.float32) + x1, x2 = torch.chunk(x.float(), 2, -1) + x_new2 = torch.cat((-x2, x1), dim=-1) + x_shape = x.shape + h = x.float() + grad = dy_tensor.float() + condition_1 = (((r1_shape[0] == 1 and x_shape[0] != 1) or (r1_shape[0] == 1 and x_shape[0] == 1)) and + ((r1_shape[2] == 1 and x_shape[2] != 1) or (r1_shape[2] == 1 and x_shape[2] == 1)) and + (r1_shape[1] == x_shape[1]) and (r1_shape[3] == x_shape[3])) + condition_2 = (((r1_shape[0] == 1 and x_shape[0] != 1) or (r1_shape[0] == 1 and x_shape[0] == 1)) and + ((r1_shape[1] == 1 and x_shape[1] != 1) or (r1_shape[1] == 1 and x_shape[1] == 1)) and + (r1_shape[2] == x_shape[2]) and (r1_shape[3] == x_shape[3])) + condition_3 = (((r1_shape[2] == 1 and x_shape[2] != 1) or (r1_shape[2] == 1 and x_shape[2] == 1)) and + ((r1_shape[1] == 1 and x_shape[1] != 1) or (r1_shape[1] == 1 and x_shape[1] == 1)) and + (r1_shape[0] == x_shape[0]) and (r1_shape[3] == x_shape[3])) + if condition_1: + for i in range(x_shape[0]): + for j in range(x_shape[2]): + r2_grad[0, :, 0, :] += (x_new2[i, :, j, :] * grad[i, :, j, :]) + r1_grad[0, :, 0, :] += (h[i, :, j, :] * grad[i, :, j, :]) + elif condition_2: + for i in range(x_shape[0]): + for j in range(x_shape[1]): + r2_grad[0, 0, :, :] += (x_new2[i, j, :, :] * grad[i, j, :, :]) + r1_grad[0, 0, :, :] += (h[i, j, :, :] * grad[i, j, :, :]) + elif condition_3: + for i in range(x_shape[1]): + for j in range(x_shape[2]): + r2_grad[:, 0, 0, :] += (x_new2[:, i, j, :] * grad[:, i, j, :]) + r1_grad[:, 0, 0, :] += (h[:, i, j, :] * grad[:, i, j, :]) + return x.grad.cpu(), r1_grad.cpu(), r2_grad.cpu() diff --git 
a/debug/accuracy_tools/msprobe/pytorch/bench_functions/scaled_mask_softmax.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/scaled_mask_softmax.py new file mode 100644 index 00000000000..8717aebaf90 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/scaled_mask_softmax.py @@ -0,0 +1,26 @@ +import torch + + +def npu_scaled_masked_softmax(x, mask, scale, fixed_triu_mask): + if fixed_triu_mask: + mask = (torch.triu(torch.ones(mask.shape), k=1)).bool().to(mask.device) + dtype = x.dtype + x = (x * scale).masked_fill(mask, value=-10000) + x = x - torch.max(x, dim=-1, keepdims=True)[0] + x = torch.exp(x.float()) + y = torch.div(x, torch.sum(x, dim=-1, keepdims=True)) + return y.to(dtype).cpu() + + +def npu_scaled_masked_softmax_backward(y_grad, y, mask, scale, fixed_triu_mask): + if fixed_triu_mask: + mask = (torch.triu(torch.ones(mask.shape), k=1)).bool().to(mask.device) + dtype = y_grad.dtype + y_grad = y_grad.float() + y = y.float() + x_grad = y_grad * y + x_grad = y_grad - torch.sum(x_grad, dim=-1, keepdims=True) + x_grad = x_grad * y + x_grad = x_grad * scale + x_grad = x_grad.masked_fill(mask, value=0) + return x_grad.to(dtype).cpu() diff --git a/debug/accuracy_tools/msprobe/pytorch/bench_functions/swiglu.py b/debug/accuracy_tools/msprobe/pytorch/bench_functions/swiglu.py new file mode 100644 index 00000000000..e03c975a50a --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/bench_functions/swiglu.py @@ -0,0 +1,55 @@ +import torch + + +def npu_swiglu(x, dim=-1): + tensor_dtype = x.dtype + + inTensors = torch.chunk(x, 2, dim=dim) + if tensor_dtype == torch.float32: + tensor_scalar = torch.sigmoid(torch.mul(inTensors[0], 1.0)) + output_data = torch.mul(torch.mul(tensor_scalar, inTensors[0]), inTensors[1]) + else: + tensor_self_float = inTensors[0].type(torch.float) + tensor_other_float = inTensors[1].type(torch.float) + tensor_out_float = torch.nn.functional.silu(tensor_self_float).type(tensor_dtype).type( + torch.float32) * 
tensor_other_float + output_data = tensor_out_float.type(tensor_dtype) + return output_data.cpu() + + +def npu_swiglu_backward(grad, x, dim=-1): + tensor_dtype = grad.dtype + in_tensors = torch.chunk(x, 2, dim=dim) + tensor_grad_out = grad + + if tensor_dtype == torch.float16: + tensor_out1 = torch.mul( + torch.mul(in_tensors[1].type(torch.float32), swish_grad(1, in_tensors[0].type(torch.float32))), + tensor_grad_out.type(torch.float32)).type(torch.float16) + tensor_out2 = torch.mul(tensor_grad_out.type(torch.float32), + swish(1, in_tensors[0].type(torch.float32))).type(torch.float16) + output = torch.cat((tensor_out1, tensor_out2), dim) + elif tensor_dtype == torch.bfloat16: + tensor_self_float = in_tensors[0].type(torch.float) + tensor_other_float = in_tensors[1].type(torch.float) + tensor_gradout_float = tensor_grad_out.type(torch.float) + + tensor_out1 = torch.mul(tensor_gradout_float, swish_grad(1.0, tensor_self_float)).type(torch.bfloat16).type( + torch.float32) * tensor_other_float + tensor_out2 = swish(1.0, tensor_self_float).type(torch.bfloat16).type(torch.float32) * tensor_gradout_float + tensor_out_float = torch.cat((tensor_out1, tensor_out2), dim=dim) + output = tensor_out_float.type(torch.bfloat16) + else: + tensor_out1 = torch.mul(torch.mul(in_tensors[1], swish_grad(1.0, in_tensors[0])), tensor_grad_out) + tensor_out2 = torch.mul(tensor_grad_out, swish(1.0, in_tensors[0])) + output = torch.cat((tensor_out1, tensor_out2), dim) + return output.cpu() + + +def swish_grad(beta, x): + return torch.sigmoid(beta * x) + x * (1 - torch.sigmoid(beta * x)) * torch.sigmoid(beta * x) * beta + + +def swish(beta, x): + return x * torch.sigmoid(beta * x) + diff --git a/debug/accuracy_tools/atat/pytorch/common/__init__.py b/debug/accuracy_tools/msprobe/pytorch/common/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/common/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/common/__init__.py diff --git 
a/debug/accuracy_tools/atat/pytorch/common/compare_script.template b/debug/accuracy_tools/msprobe/pytorch/common/compare_script.template similarity index 100% rename from debug/accuracy_tools/atat/pytorch/common/compare_script.template rename to debug/accuracy_tools/msprobe/pytorch/common/compare_script.template diff --git a/debug/accuracy_tools/atat/pytorch/common/log.py b/debug/accuracy_tools/msprobe/pytorch/common/log.py similarity index 81% rename from debug/accuracy_tools/atat/pytorch/common/log.py rename to debug/accuracy_tools/msprobe/pytorch/common/log.py index e496e9b72ad..cea518fa47b 100644 --- a/debug/accuracy_tools/atat/pytorch/common/log.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/log.py @@ -1,9 +1,9 @@ import os import time import sys -from atat.pytorch.common.utils import get_rank_if_initialized -from atat.core.common.log import BaseLogger -from atat.core.common.exceptions import DistributedNotInitializedError +from msprobe.pytorch.common.utils import get_rank_if_initialized +from msprobe.core.common.log import BaseLogger +from msprobe.core.common.exceptions import DistributedNotInitializedError class PyTorchLogger(BaseLogger): diff --git a/debug/accuracy_tools/atat/pytorch/common/parse_json.py b/debug/accuracy_tools/msprobe/pytorch/common/parse_json.py similarity index 89% rename from debug/accuracy_tools/atat/pytorch/common/parse_json.py rename to debug/accuracy_tools/msprobe/pytorch/common/parse_json.py index a938f5f0da9..ccad903724c 100644 --- a/debug/accuracy_tools/atat/pytorch/common/parse_json.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/parse_json.py @@ -1,5 +1,7 @@ import json -from atat.core.common.exceptions import ParseJsonException + +from msprobe.core.common.exceptions import ParseJsonException +from msprobe.core.common.file_check import FileOpen def parse_json_info_forward_backward(json_path): @@ -11,7 +13,7 @@ def parse_json_info_forward_backward(json_path): api_name = '.'.join(name_struct[:-1]) return api_name - with 
open(json_path, 'r') as f: + with FileOpen(json_path, 'r') as f: dump_json = json.load(f) real_data_path = dump_json.get("dump_data_dir") diff --git a/debug/accuracy_tools/atat/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py similarity index 87% rename from debug/accuracy_tools/atat/pytorch/common/utils.py rename to debug/accuracy_tools/msprobe/pytorch/common/utils.py index 4b413ac5750..181491488f9 100644 --- a/debug/accuracy_tools/atat/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -14,13 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. """ +import logging import os import random import stat import torch +import torch.distributed as dist import numpy as np from functools import wraps -from atat.core.common.exceptions import DistributedNotInitializedError +from msprobe.core.common.exceptions import DistributedNotInitializedError try: import torch_npu @@ -29,7 +31,6 @@ except ImportError: else: is_gpu = False - torch_without_guard_version_list = ['2.1', '2.2'] for version in torch_without_guard_version_list: if torch.__version__.startswith(version): @@ -222,3 +223,36 @@ class Const: CONVERT_API = { "int32_to_int64": ["cross_entropy"] } + + +def get_tensor_rank(in_feat, out_feat): + if dist.is_initialized(): + return dist.get_rank() + + def get_tensor_rank_single(x): + if isinstance(x, (list, tuple)): + if len(x) > 0: + return get_tensor_rank_single(x[0]) + elif isinstance(x, torch.Tensor): + device = x.device + if device.type != 'cpu': + return device.index + return None + + in_rank = get_tensor_rank_single(in_feat) + out_rank = get_tensor_rank_single(out_feat) + tensor_rank = in_rank if in_rank else out_rank + return tensor_rank + + +def _create_logger(level=logging.INFO): + logger_ = logging.getLogger() + logger_.setLevel(level) + ch = logging.StreamHandler() + ch.setLevel(level) + logger_.addHandler(ch) + return logger_ + + 
+log_level = logging.DEBUG if os.environ.get("API_ACCURACY_CHECK_LOG_LEVEL") == "1" else logging.INFO +logger = _create_logger(log_level) diff --git a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py similarity index 95% rename from debug/accuracy_tools/atat/pytorch/compare/acc_compare.py rename to debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py index 061c9cdfca8..2a68c756ed3 100644 --- a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py @@ -27,15 +27,17 @@ from openpyxl.styles import PatternFill from collections import namedtuple from dataclasses import dataclass -from atat.pytorch.compare.match import graph_mapping -from atat.pytorch.compare.highlight import HighlightRules, get_header_index -from atat.pytorch.compare.npy_compare import compare_ops_apply, get_error_type, reshape_value, get_relative_err, get_error_message -from atat.pytorch.advisor.advisor import Advisor -from atat.pytorch.common.log import logger -from atat.core.common.utils import check_compare_param, add_time_with_xlsx, CompareException, \ +from msprobe.pytorch.compare.match import graph_mapping +from msprobe.pytorch.compare.highlight import HighlightRules, get_header_index +from msprobe.pytorch.compare.npy_compare import compare_ops_apply, get_error_type, reshape_value, get_relative_err, \ + get_error_message +from msprobe.pytorch.advisor.advisor import Advisor +from msprobe.pytorch.common.log import logger +from msprobe.core.common.utils import check_compare_param, add_time_with_xlsx, CompareException, \ format_value, check_file_not_exists, check_configuration_param, task_dumppath_get -from atat.core.common.file_check import FileChecker, change_mode, FileOpen, create_directory -from atat.core.common.const import Const, CompareConst, FileCheckConst +from msprobe.core.common.file_check import FileChecker, change_mode, FileOpen, 
create_directory +from msprobe.core.common.const import Const, CompareConst, FileCheckConst +from msprobe.core.common.exceptions import FileCheckException def check_graph_mode(a_op_name, b_op_name): @@ -490,6 +492,10 @@ def compare_by_op(op_name, op_name_mapping_dict, input_parma): error_file = error.filename n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE error_flag = True + except FileCheckException: + error_file = data_name + n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE + error_flag = True n_value, b_value, error_flag = get_error_type(n_value, b_value, error_flag) if not error_flag: @@ -525,8 +531,10 @@ def handle_inf_nan(n_value, b_value): return n_value, b_value -def find_error_rows(result, last_len, n_num_input, highlight_dict, summary_compare=False): +def find_error_rows(result, last_len, n_num_input, highlight_dict, summary_compare=False, md5_compare=False): """找到单个API中需要高亮的行""" + if md5_compare: + return npu_max_index = get_header_index('NPU max', summary_compare) bench_max_index = get_header_index('Bench max', summary_compare) max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare) @@ -582,7 +590,7 @@ def get_name_and_state(name): return api_name, state -def find_compare_result_error_rows(result_df, highlight_dict, summary_compare): +def find_compare_result_error_rows(result_df, highlight_dict, summary_compare, md5_compare): """将dataframe根据API分组,并找到有误差的算子用于高亮""" result = result_df.values start, input_num, output_num, end = 0, 0, 0, len(result_df) @@ -600,7 +608,7 @@ def find_compare_result_error_rows(result_df, highlight_dict, summary_compare): else: output_num = num find_error_rows(result[start:start + input_num + output_num], start, input_num, highlight_dict, - summary_compare) + summary_compare, md5_compare) num, last_api_name, last_state = 1, api_name, state start += input_num + output_num input_num, output_num = 1, 0 @@ -611,7 +619,7 @@ def 
find_compare_result_error_rows(result_df, highlight_dict, summary_compare): input_num = num else: output_num = num - find_error_rows(result[start:start + input_num + output_num], start, input_num, highlight_dict, summary_compare) + find_error_rows(result[start:start + input_num + output_num], start, input_num, highlight_dict, summary_compare, md5_compare) def highlight_rows_xlsx(result_df, highlight_dict, file_path): @@ -637,7 +645,11 @@ def highlight_rows_xlsx(result_df, highlight_dict, file_path): elif (i - 2) in highlight_dict['yellow_rows']: ws.cell(row=i, column=j).fill = PatternFill(start_color=CompareConst.YELLOW, end_color=CompareConst.YELLOW, fill_type="solid") - wb.save(file_path) + try: + wb.save(file_path) + except Exception as e: + logger.error('Save result file failed') + raise CompareException(CompareException.WRITE_FILE_ERROR) from e change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) @@ -647,8 +659,8 @@ def compare(input_parma, output_path, stack_mode=False, auto_analyze=True, summary_compare, md5_compare = task_dumppath_get(input_parma) check_configuration_param(stack_mode, auto_analyze, fuzzy_match) create_directory(output_path) - check_compare_param(input_parma, output_path, stack_mode, summary_compare, md5_compare) - except CompareException as error: + check_compare_param(input_parma, output_path, summary_compare, md5_compare) + except (CompareException, FileCheckException) as error: logger.error('Compare failed. 
Please check the arguments and do it again!') sys.exit(error.code) compare_core(input_parma, output_path, stack_mode=stack_mode, @@ -696,7 +708,7 @@ def compare_core(input_parma, output_path, **kwargs): if not md5_compare and not summary_compare: result_df = _do_multi_process(input_parma, result_df) - find_compare_result_error_rows(result_df, highlight_dict, summary_compare) + find_compare_result_error_rows(result_df, highlight_dict, summary_compare, md5_compare) highlight_rows_xlsx(result_df, highlight_dict, file_path) if auto_analyze: advisor = Advisor(result_df, output_path) @@ -738,7 +750,7 @@ def parse(pkl_file, module_name_prefix): logger.info(summary_info) -def op_item_parse(item, op_name, index, item_list=[], top_bool=True): +def op_item_parse(item, op_name, index, item_list=None, top_bool=True): if item_list is None: item_list = [] if item is None or (isinstance(item, dict) and not item): @@ -756,9 +768,14 @@ def op_item_parse(item, op_name, index, item_list=[], top_bool=True): else: full_op_name = op_name else: - full_op_name = op_name + '.' 
+ str(index) + full_op_name = op_name + Const.SEP + str(index) if isinstance(item, dict): - if 'dtype' in item: + if 'type' not in item: + for kwarg in item: + kwarg_parsed_list = op_item_parse(item[kwarg], op_name + Const.SEP + kwarg, None) + item_list += kwarg_parsed_list + kwarg_parsed_list.clear() + elif 'dtype' in item: parsed_item = item parsed_item['full_op_name'] = full_op_name item_list.append(parsed_item) @@ -800,8 +817,8 @@ def op_item_parse(item, op_name, index, item_list=[], top_bool=True): else: resolve_api_special_parameters(item, full_op_name, item_list) else: - for j in range(len(item)): - op_item_parse(item[j], full_op_name, j, item_list=item_list, top_bool=False) + for j, item_spec in enumerate(item): + op_item_parse(item_spec, full_op_name, j, item_list=item_list, top_bool=False) return item_list @@ -861,13 +878,13 @@ def read_op(op_data, op_name): op_parsed_list += output_parsed_list output_parsed_list.clear() if 'backward' in op_name: - if 'grad_input' in op_data: - input_item = op_data['grad_input'] + if 'input' in op_data: + input_item = op_data['input'] input_parsed_list = op_item_parse(input_item, op_name + '_input', None) op_parsed_list = input_parsed_list.copy() input_parsed_list.clear() - if 'grad_output' in op_data: - output_item = op_data['grad_output'] + if 'output' in op_data: + output_item = op_data['output'] output_parsed_list = op_item_parse(output_item, op_name + '_output', None) op_parsed_list += output_parsed_list output_parsed_list.clear() diff --git a/debug/accuracy_tools/atat/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py similarity index 85% rename from debug/accuracy_tools/atat/pytorch/compare/distributed_compare.py rename to debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py index b89adc1581e..caac1395807 100644 --- a/debug/accuracy_tools/atat/pytorch/compare/distributed_compare.py +++ 
b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py @@ -17,11 +17,12 @@ import os import sys import re -from atat.core.common.utils import CompareException, check_compare_param, \ +from msprobe.core.common.utils import CompareException, check_compare_param, \ check_configuration_param, task_dumppath_get, check_file_or_directory_path, check_regex_prefix_format_valid -from atat.pytorch.compare.acc_compare import compare_core -from atat.core.common.file_check import create_directory -from atat.pytorch.common.log import logger +from msprobe.pytorch.compare.acc_compare import compare_core +from msprobe.core.common.file_check import create_directory +from msprobe.core.common.exceptions import FileCheckException +from msprobe.pytorch.common.log import logger def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): @@ -86,12 +87,11 @@ def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): 'or use compare() api and manually match the ranks.') raise CompareException(CompareException.INVALID_PATH_ERROR) for nr, br in zip(npu_ranks, bench_ranks): - n_dir = os.path.join(npu_dump_dir, nr) - b_dir = os.path.join(bench_dump_dir, br) - s_dir = b_dir - npu_json_path = extract_json(n_dir, stack_json=False) - bench_json_path = extract_json(b_dir, stack_json=False) - stack_json_path = extract_json(s_dir, stack_json=True) + npu_data_dir = os.path.join(npu_dump_dir, nr) + bench_data_dir = os.path.join(bench_dump_dir, br) + npu_json_path = extract_json(npu_data_dir, stack_json=False) + bench_json_path = extract_json(bench_data_dir, stack_json=False) + stack_json_path = extract_json(npu_data_dir, stack_json=True) dump_result_param = { 'npu_json_path': npu_json_path, @@ -103,8 +103,8 @@ def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): summary_compare, md5_compare = task_dumppath_get(dump_result_param) check_configuration_param(stack_mode, auto_analyze, fuzzy_match) create_directory(output_path) - 
check_compare_param(dump_result_param, output_path, stack_mode=stack_mode, summary_compare=summary_compare) - except CompareException as error: + check_compare_param(dump_result_param, output_path, summary_compare=summary_compare, md5_compare=md5_compare) + except (CompareException, FileCheckException) as error: logger.error('Compare failed. Please check the arguments and do it again!') sys.exit(error.code) compare_core(dump_result_param, output_path, suffix=f'_{nr}-{br}', summary_compare=summary_compare, diff --git a/debug/accuracy_tools/atat/pytorch/compare/highlight.py b/debug/accuracy_tools/msprobe/pytorch/compare/highlight.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/compare/highlight.py rename to debug/accuracy_tools/msprobe/pytorch/compare/highlight.py index 3a6898dedbb..82f0022f8b5 100644 --- a/debug/accuracy_tools/atat/pytorch/compare/highlight.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/highlight.py @@ -1,8 +1,8 @@ import math import abc import numpy as np -from atat.core.common.utils import get_header_index -from atat.core.common.const import CompareConst +from msprobe.core.common.utils import get_header_index +from msprobe.core.common.const import CompareConst class HighlightCheck(abc.ABC): diff --git a/debug/accuracy_tools/atat/pytorch/compare/mapping.yaml b/debug/accuracy_tools/msprobe/pytorch/compare/mapping.yaml similarity index 100% rename from debug/accuracy_tools/atat/pytorch/compare/mapping.yaml rename to debug/accuracy_tools/msprobe/pytorch/compare/mapping.yaml diff --git a/debug/accuracy_tools/atat/pytorch/compare/match.py b/debug/accuracy_tools/msprobe/pytorch/compare/match.py similarity index 91% rename from debug/accuracy_tools/atat/pytorch/compare/match.py rename to debug/accuracy_tools/msprobe/pytorch/compare/match.py index 148fbb7d640..6347d8887c8 100644 --- a/debug/accuracy_tools/atat/pytorch/compare/match.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/match.py @@ -1,7 +1,7 @@ import os import 
yaml -from atat.core.common.file_check import FileOpen -from atat.core.common.utils import CompareException +from msprobe.core.common.file_check import FileOpen +from msprobe.core.common.utils import CompareException class AtenIrMapping(): diff --git a/debug/accuracy_tools/atat/pytorch/compare/npy_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/npy_compare.py similarity index 98% rename from debug/accuracy_tools/atat/pytorch/compare/npy_compare.py rename to debug/accuracy_tools/msprobe/pytorch/compare/npy_compare.py index 0cf4c6c00a0..5a0feb4cd4a 100644 --- a/debug/accuracy_tools/atat/pytorch/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/npy_compare.py @@ -1,8 +1,8 @@ import abc import numpy as np -from atat.core.common.utils import format_value -from atat.core.common.const import Const, CompareConst -from atat.pytorch.common.log import logger +from msprobe.core.common.utils import format_value +from msprobe.core.common.const import Const, CompareConst +from msprobe.pytorch.common.log import logger def handle_inf_nan(n_value, b_value): diff --git a/debug/accuracy_tools/atat/pytorch/debugger/__init__.py b/debug/accuracy_tools/msprobe/pytorch/debugger/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/debugger/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/debugger/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py similarity index 89% rename from debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py rename to debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py index 1ad69701e41..f1289e9b013 100644 --- a/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py +++ b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py @@ -1,6 +1,6 @@ -from atat.pytorch.common import seed_all -from atat.pytorch.common.log import logger -from atat.core.common.const 
import Const +from msprobe.pytorch.common import seed_all +from msprobe.pytorch.common.log import logger +from msprobe.core.common.const import Const class DebuggerConfig: @@ -21,7 +21,7 @@ class DebuggerConfig: self.acl_config = common_config.acl_config if common_config.acl_config else "" self.is_forward_acl_dump = True self.summary_mode = task_config.summary_mode if task_config.summary_mode else Const.STATISTICS - self.overflow_num = task_config.overflow_num if task_config.overflow_num else 1 + self.overflow_nums = task_config.overflow_nums if task_config.overflow_nums else 1 self.framework = Const.PT_FRAMEWORK if self.task == Const.FREE_BENCHMARK: @@ -46,9 +46,8 @@ class DebuggerConfig: raise ValueError("backward_input must be configured when scope contains 'backward'") if Const.BACKWARD in self.scope[0]: self.is_forward_acl_dump = False - for index in range(len(self.scope)): - # Do this replace operation to let the acl backward dump can be done in forward hook. - self.scope[index] = self.scope[index].replace(Const.BACKWARD, Const.FORWARD) + for index, scope_spec in enumerate(self.scope): + self.scope[index] = scope_spec.replace(Const.BACKWARD, Const.FORWARD) self.backward_input[self.scope[index]] = self.backward_input_list[index] seed_all(self.seed, self.is_deterministic) diff --git a/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/pytorch/debugger/precision_debugger.py similarity index 77% rename from debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py rename to debug/accuracy_tools/msprobe/pytorch/debugger/precision_debugger.py index 140d829bedc..6119bbd1d4f 100644 --- a/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py +++ b/debug/accuracy_tools/msprobe/pytorch/debugger/precision_debugger.py @@ -1,10 +1,10 @@ import torch from torch.utils.data import dataloader -from atat.pytorch.debugger.debugger_config import DebuggerConfig -from atat.pytorch.service import Service -from 
atat.pytorch.common.log import logger -from atat.pytorch.pt_config import parse_json_config -from atat.core.common.exceptions import MsaccException +from msprobe.pytorch.debugger.debugger_config import DebuggerConfig +from msprobe.pytorch.service import Service +from msprobe.pytorch.common.log import logger +from msprobe.pytorch.pt_config import parse_json_config +from msprobe.core.common.exceptions import MsprobeException class PrecisionDebugger: @@ -27,6 +27,7 @@ class PrecisionDebugger: step=None, ): if not hasattr(self, "initialized"): + self.api_origin = False self.initialized = True self.model = self.check_model_valid(model) common_config, task_config = parse_json_config(config_path, task) @@ -46,6 +47,14 @@ class PrecisionDebugger: def instance(self): return self._instance + @staticmethod + def check_model_valid(model): + if not model or isinstance(model, torch.nn.Module): + return model + raise MsprobeException( + MsprobeException.INVALID_PARAM_ERROR, "model 参数必须是torch.nn.Module类型。" + ) + @classmethod def start(cls): instance = cls._instance @@ -54,7 +63,15 @@ class PrecisionDebugger: if instance.enable_dataloader: logger.warning_on_rank_0("DataLoader is enabled, start() skipped.") else: - instance.service.start(instance.model) + instance.service.start(instance.model, instance.api_origin) + instance.api_origin = False + + # 指定代码段dump前反向结束符,之后的计算过程数据将被忽略,无法被dump + @classmethod + def forward_backward_dump_end(cls): + instance = cls._instance + instance.service.forward_backward_dump_end() + instance.api_origin = True @classmethod def stop(cls): @@ -72,14 +89,6 @@ class PrecisionDebugger: raise Exception("PrecisionDebugger instance is not created.") cls._instance.service.step() - @staticmethod - def check_model_valid(model): - if not model or isinstance(model, torch.nn.Module): - return model - raise MsaccException( - MsaccException.INVALID_PARAM_ERROR, "model 参数必须是torch.nn.Module类型。" - ) - def iter_tracer(func): def func_wrapper(*args, **kwargs): diff --git 
a/debug/accuracy_tools/atat/pytorch/doc/FAQ.md b/debug/accuracy_tools/msprobe/pytorch/doc/FAQ.md similarity index 72% rename from debug/accuracy_tools/atat/pytorch/doc/FAQ.md rename to debug/accuracy_tools/msprobe/pytorch/doc/FAQ.md index 19a434a1946..8d12a72928e 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/FAQ.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/FAQ.md @@ -22,15 +22,15 @@ 6. 添加预检工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 - 答:注释工具目录mstt/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + 答:注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 7. 添加预检工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。 - 答:注释工具目录mstt/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + 答:注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 8. 添加预检工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。 - 答:注释工具目录mstt/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 + 答:注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 9. Tensor 魔法函数具体对应什么操作? 
@@ -75,7 +75,7 @@ ### dump指定融合算子 -dump指定操作当前支持dump指定融合算子的输入输出,需要在mstt/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml中添加,比如以下代码段调用的softmax融合算子 +dump指定操作当前支持dump指定融合算子的输入输出,需要在mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml中添加,比如以下代码段调用的softmax融合算子 ``` def npu_forward_fused_softmax(self, input_, mask): @@ -111,7 +111,7 @@ torch版本和硬件差异属于正常情况。 **故障现象** -使用atat工具时,报错: error code: EI0006。 +使用msprobe工具时,报错: error code: EI0006。 **故障原因** @@ -136,7 +136,7 @@ torch.npu.set_device('npu:0') torch.npu.set_device(f'npu:{rank}') ``` -如果运行精度比对功能遇到这个报错,尝试安装最新版本的atat。 +如果运行精度比对功能遇到这个报错,尝试安装最新版本的msprobe。 ### 4. dump得到的VF_lstm_99_forward_input.1.0.npy、VF_lstm_99_forward_input.1.1.npy类似的数据是否正常? @@ -147,7 +147,7 @@ torch.npu.set_device(f'npu:{rank}') 在比对脚本中,设置stack_mode=True,例如: ``` -from atat.pytorch import compare +from msprobe.pytorch import compare dump_result_param={ "npu_json_path": "./npu_dump/dump.json", "bench_json_path": "./gpu_dump/dump.json", @@ -174,20 +174,20 @@ compare(dump_result_param, output_path="./output", stack_mode=True) ### 9. dump.json文件中的某些api的dtype类型为float16,但是读取此api的npy文件显示的dtype类型为float32 -- atat工具在dump数据时需要将原始数据从npu to cpu上再转换为numpy类型,npu to cpu的逻辑和gpu to cpu是保持一致的,都存在dtype可能从float16变为float32类型的情况,如果出现dtype不一致的问题,最终dump数据的dtype以pkl文件为准。 +- msprobe工具在dump数据时需要将原始数据从npu to cpu上再转换为numpy类型,npu to cpu的逻辑和gpu to cpu是保持一致的,都存在dtype可能从float16变为float32类型的情况,如果出现dtype不一致的问题,最终dump数据的dtype以pkl文件为准。 -### 10. 使用dataloader后raise异常Exception("atat: exit after iteration {}". format(max(self.config.step)) +### 10. 使用dataloader后raise异常Exception("msprobe: exit after iteration {}". format(max(self.config.step)) - 正常现象,dataloader通过raise结束程序,堆栈信息可忽略。 -### 11. 添加atat工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 +### 11. 
添加msprobe工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 -- 注释工具目录mstt/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 +- 注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 -### 12. 添加atat工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。 +### 12. 添加msprobe工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。 -- 注释工具目录mstt/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置api也可以考虑根据报错堆栈信息注释引发报错的类型检查。 +- 注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置api也可以考虑根据报错堆栈信息注释引发报错的类型检查。 -### 13. 添加atat工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。 +### 13. 添加msprobe工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。 -- 注释工具目录mstt/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 +- 注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 diff --git a/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md b/debug/accuracy_tools/msprobe/pytorch/doc/api_accuracy_checker.md similarity index 84% rename from debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md rename to debug/accuracy_tools/msprobe/pytorch/doc/api_accuracy_checker.md index 0e45a4e83fb..41b97098ae9 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/api_accuracy_checker.md @@ -20,8 +20,8 @@ 精度预检操作流程如下: -1. 在NPU和GPU环境下分别安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 -2. 
在NPU训练脚本内添加atat工具dump接口PrecisionDebugger采集待预检数据。详见《[精度数据采集](./dump.md)》。 +1. 在NPU和GPU环境下分别安装msprobe工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +2. 在NPU训练脚本内添加msprobe工具dump接口PrecisionDebugger,采集待预检数据。详见《[精度数据采集](./dump.md)》,注意需要配置level="L1"。 3. 将NPU环境下dump的预检数据拷贝至GPU环境。 4. 在NPU和GPU环境下分别执行run_ut,生成结果用于最终api_precision_compare操作的输入。详见“**run_ut预检操作**”。 5. 将NPU和GPU执行run_ut生成的`accuracy_checking_details_{timestamp}.csv`结果文件拷贝至同一环境下。 @@ -43,7 +43,7 @@ run_ut预检操作包括如下场景: 1. 将API信息输入给run_ut模块运行精度检测并比对,运行如下命令: ```bash - atat -f pytorch run_ut -api_info ./dump.json + msprobe -f pytorch run_ut -api_info ./dump.json ``` | 参数名称 | 说明 | 是否必选 | @@ -51,20 +51,22 @@ run_ut预检操作包括如下场景: | -api_info或--api_info_file | 指定API信息文件dump.json。 | 是 | | -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | | -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | + | | | | | -j或--jit_compile | 开启jit编译。 | 否 | | -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | | -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | | -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + | -config或--config_path | 指定预检操作过程中的额外配置(包括黑名单、白名单等)的[config.json](https://gitee.com/ascend/mstt/tree/master/debug/accuracy_tools/msprobe/config)文件,默认未配置。config.json文件的配置可参考《[配置文件说明](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/config/README.md#pytorch场景task配置为run_ut)》。 | 否 | run_ut执行结果包括`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`两个文件。`accuracy_checking_result_{timestamp}.csv`是API粒度的,标明每个API是否通过测试。建议用户先查看`accuracy_checking_result_{timestamp}.csv`文件,对于其中没有通过测试的或者特定感兴趣的API,根据其API name字段在`accuracy_checking_details_{timestamp}.csv`中查询其各个输出的达标情况以及比较指标。详细介绍请参见“**预检结果**”。 2. 
(可选)如果需要保存比对不达标的输入和输出数据,可以在run_ut执行命令结尾添加-save_error_data,例如: ```bash - atat -f pytorch run_ut -api_info ./dump.json -save_error_data + msprobe -f pytorch run_ut -api_info ./dump.json -save_error_data ``` - 数据默认会存盘到'./ut_error_data{timestamp}'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过修改mstt/debug/accuracy_tools/api_accuracy_checker目录下,config.yaml文件的error_data_path参数来配置保存路径,详见“config.yaml文件说明”。 + 数据默认会存盘到'./ut_error_data{timestamp}'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过error_data_path参数来配置保存路径,error_data_path参数在[config.json](https://gitee.com/ascend/mstt/tree/master/debug/accuracy_tools/msprobe/config)文件或config.yaml文件配置,config.json文件需要在run_ut操作时通过-config参数指定,config.yaml文件详见“**config.yaml文件说明**”。 #### 使用multi_run_ut.py执行多线程预检 @@ -73,7 +75,7 @@ multi_run_ut.py脚本,可以并行执行多个run_ut操作,从而降低预 命令示例如下: ```bash -atat -f pytorch multi_run_ut -api_info ./dump.json -n 32 -d 0 1 2 3 +msprobe -f pytorch multi_run_ut -api_info ./dump.json -n 32 -d 0 1 2 3 ``` | 参数名称 | 说明 | 是否必选 | @@ -96,26 +98,68 @@ atat -f pytorch multi_run_ut -api_info ./dump.json -n 32 -d 0 1 2 3 断点续检操作通过如下命令执行: ```bash -atat -f pytorch run_ut -api_info ./dump.json -csv_path /home/xxx/ut/accuracy_checking_result_{timestamp}.csv +msprobe -f pytorch run_ut -api_info ./dump.json -csv_path /home/xxx/ut/accuracy_checking_result_{timestamp}.csv ``` -#### API预检白名单 +#### API预检黑名单和白名单 -run_ut过程支持API预检白名单,操作方式如下: +run_ut过程支持API预检黑名单和白名单,通过如下文件配置black_list(黑名单)或white_list(白名单)参数来指定不需要或需要预检的API名称: -修改mstt/debug/accuracy_tools/api_accuracy_checker目录下config.yaml文件的white_list参数,配置需要预检的API名称,详见“config.yaml文件说明”。 +- 配置[config.json](https://gitee.com/ascend/mstt/tree/master/debug/accuracy_tools/msprobe/config)文件,config.json文件需要在run_ut操作时通过-config参数指定。 +- 配置config.yaml文件,详见“**config.yaml文件说明**”。 + +config.json文件的优先级高于config.yaml文件,即执行config.json文件时,config.yaml文件的配置不生效。 ### config.yaml文件说明 -config.yaml文件可以通过配置参数来控制dump和run_ut操作的白名单等功能。 +config.yaml文件可以通过配置参数来控制dump和run_ut操作的白名单、黑名单等功能。操作步骤如下: + +1. 
查找msprobe工具安装路径。 + + ```bash + pip show mindstudio-probe + ``` + + 输出结果如下示例: + + ```bash + Name: mindstudio-probe + Version: 1.0 + Summary: This is a pytorch precision comparison tools + Home-page: + Author: + Author-email: + License: + Location: /home/xx/anaconda3/envs/pt21py38/lib/python3.8/site-packages + Requires: numpy, openpyxl, pandas, pyyaml, rich, tqdm, wheel + Required-by: + ``` + + Location字段为msprobe工具的安装路径,那么config.yaml文件位置为/home/xx/anaconda3/envs/pt21py38/lib/python3.8/site-packages/msprobe/pytorch/api_accuracy_checker/config.yaml + +2. 进入config.yaml文件 + + ```bash + vi /home/xx/anaconda3/envs/pt21py38/lib/python3.8/site-packages/msprobe/pytorch/api_accuracy_checker/config.yaml + ``` + +3. 修改config.yaml文件参数。 + + ```yaml + white_list: [] + black_list: [] + error_data_path: './' + precision: 14 + ``` -文件路径为:mstt/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml + | 参数名称 | 说明 | 是否必选 | + | --------------- | ------------------------------------------------------------ | -------- | + | white_list | API dump白名单,仅对指定的API进行dump。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | + | black_list | API dump黑名单,被指定的API不进行dump。参数示例:black_list=["conv1d", "conv2d"]。默认未配置黑名单,即dump全量API数据。 | 否 | + | error_data_path | 配置保存精度未达标的API输入输出数据路径。参数示例"error_data_path": "./"。默认为当前路径。 | 否 | + | precision | 浮点数表示位数,默认取小数点后14位。 | 否 | -| 参数名称 | 说明 | 是否必选 | -| --------------- | ------------------------------------------------------------ | -------- | -| white_list | API dump白名单,指定dump具体API数据,也可以直接配置预检的API白名单,详细请参见“**API预检白名单**”。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | -| error_data_path | 配置保存精度未达标的API输入输出数据路径。 | 否 | -| precision | 浮点数表示位数,默认取小数点后14位。 | 否 | + 说明:white_list和black_list同时配置时,二者配置的API名单若无交集,则白名单生效,若API名单存在交集,则白名单排除的部分以及交集的API不进行dump。 ## 预检结果 @@ -203,7 +247,7 @@ API预检通过测试,则在`accuracy_checking_details_{timestamp}.csv`文件 需要同时获取NPU和GPU环境下run_ut操作的预检结果`accuracy_checking_details_{timestamp}.csv`文件。执行如下命令进行NPU和GPU预检结果的比对: 
```bash -atat -f pytorch api_precision_compare -npu /home/xxx/npu/accuracy_checking_details_{timestamp}.csv -gpu /home/xxx/gpu/accuracy_checking_details_{timestamp}.csv -o /home/xxx/ +msprobe -f pytorch api_precision_compare -npu /home/xxx/npu/accuracy_checking_details_{timestamp}.csv -gpu /home/xxx/gpu/accuracy_checking_details_{timestamp}.csv -o /home/xxx/ ``` | 参数名称 | 说明 | 是否必选 | diff --git a/debug/accuracy_tools/atat/pytorch/doc/dump.md b/debug/accuracy_tools/msprobe/pytorch/doc/dump.md similarity index 66% rename from debug/accuracy_tools/atat/pytorch/doc/dump.md rename to debug/accuracy_tools/msprobe/pytorch/doc/dump.md index 1e401b4f5a2..7e393cd1026 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/dump.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/dump.md @@ -1,8 +1,8 @@ # **精度数据采集** -atat工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 +msprobe工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 -执行dump操作需要安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +执行dump操作需要安装msprobe工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 ## dump接口介绍 @@ -12,7 +12,7 @@ atat工具主要通过在训练脚本内添加dump接口并启动训练的方式 通过加载dump配置文件的方式来确定dump操作的详细配置。 -可以在from atat.pytorch import PrecisionDebugger和模型初始化之间的任意位置添加该接口。 +PrecisionDebugger接口可以在from msprobe.pytorch import PrecisionDebugger之后的位置添加。详细使用可参考“**示例代码**”或“**model配置代码示例**”。 **原型** @@ -20,7 +20,7 @@ atat工具主要通过在训练脚本内添加dump接口并启动训练的方式 PrecisionDebugger(config_path=None, task=None, dump_path=None, level=None, model=None, step=None) ``` -说明:上述参数除config_path和model外,其他参数均在[config.json](../../config)文件中可配,此处的参数优先级高于[config.json](../../config)文件中的配置,而config.json文件可以配置更多参数,若需要进行更多场景的精度数据dump,建议配置[config.json](../../config)文件。 +说明:上述参数除config_path和model外,其他参数均在[config.json](../../config)文件中可配,此处的参数优先级高于[config.json](../../config)文件中的配置,而config.json文件可以配置更多参数,若需要进行更多场景的精度数据dump,建议配置[config.json](../../config)文件。config.json文件的配置可参考《[配置文件说明](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/config/README.md)》。 **参数说明** @@ -44,7 +44,7 @@ import 
torch import torch.nn as nn import torch_npu import torch.nn.functional as F -from atat.pytorch import PrecisionDebugger +from msprobe.pytorch import PrecisionDebugger torch.npu.set_device("npu:0") #定义一个简单的网络 @@ -77,9 +77,9 @@ if __name__ == "__main__" **功能说明** -启动函数。 +dump启动函数。 -在模型初始化之后的任意位置添加。 +在模型初始化之后的位置添加。需要与stop函数一起添加在for循环内。 **原型** @@ -93,9 +93,9 @@ debugger.start() **功能说明** -停止函数。 +dump停止函数。 -在**start**函数之后的任意位置添加。 +在**start**函数之后的任意位置添加。若需要dump反向数据,则需要添加在反向计算代码(如loss.backward)之后。 **原型** @@ -105,13 +105,33 @@ debugger.stop() 该函数为类函数,可以使用debugger.stop()也可以使用PrecisionDebugger.stop()。 +### forward_backward_dump_end函数 + +**功能说明** + +dump停止函数。用于dump指定代码的前反向数据。 + +在**start**函数之后,反向计算代码(如loss.backward)之前的任意位置添加,可以dump **start**函数和该函数之间的前反向数据,可以通过调整**start**函数与该函数的位置,来指定需要dump的代码块。 + +要求**stop**函数添加在反向计算代码(如loss.backward)之后,此时该函数与**stop**函数之间的代码不会被dump。 + +使用示例参见“**示例代码 > 扩展示例**”。 + +**原型** + +```Python +forward_backward_dump_end() +``` + +该函数为类函数,可以使用debugger.forward_backward_dump_end()也可以使用PrecisionDebugger.forward_backward_dump_end()。 + ### step函数 **功能说明** 结束标识。 -在最后一个**stop**函数后或一个step结束的位置添加。 +在最后一个**stop**函数后或一个step结束的位置添加。需要与start函数一起添加在for循环内。 **原型** @@ -123,24 +143,57 @@ debugger.step() ## 示例代码 +### 基础操作 + +如下示例可dump完整代码的前反向数据。 + ```Python -from atat.pytorch import PrecisionDebugger +from msprobe.pytorch import PrecisionDebugger + +# 请勿将PrecisionDebugger的初始化流程插入到循环代码中 debugger = PrecisionDebugger(config_path="./config.json", dump_path="./dump_path") -# 请勿将以上初始化流程插入到循环代码中 -# 模型初始化 -# 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() -debugger.start() +# 模型、损失函数的定义及初始化等操作 +# ... -# 需要dump的代码片段1 +# 数据集迭代的位置一般为模型训练开始的位置 +for data, label in data_loader: + debugger.start() # 开启数据dump -debugger.stop() -debugger.start() + # 如下是模型每个step执行的逻辑 + output = model(data) + #... 
+ loss.backward() + + debugger.stop() # 关闭数据dump + debugger.step() # 结束一个step的dump +``` -# 需要dump的代码片段2 +### 扩展示例 -debugger.stop() -debugger.step() +如下示例dump指定代码块前反向数据。 + +```Python +from msprobe.pytorch import PrecisionDebugger + +# 请勿将PrecisionDebugger的初始化流程插入到循环代码中 +debugger = PrecisionDebugger(config_path="./config.json", dump_path="./dump_path") + +# 模型、损失函数的定义及初始化等操作 +# ... + +# 数据集迭代的位置一般为模型训练开始的位置 +for data, label in data_loader: + debugger.start() # 开启数据dump + + # 如下是模型每个step执行的逻辑 + output = model(data) + debugger.forward_backward_dump_end() # 插入该函数到start函数之后,只dump start函数到该函数之间代码的前反向数据,本函数到stop函数之间的数据则不dump + #... + loss.backward() + + debugger.stop() # 关闭数据dump + debugger.step() # 结束一个step的dump ``` ## dump结果文件介绍 @@ -193,7 +246,7 @@ pt文件保存的前缀和PyTorch对应关系如下: ## 工具支持的API列表 -atat工具维护固定的API支持列表,若需要删除或增加dump的API,可以在atat/pytorch/hook_module/support_wrap_ops.yaml文件内手动修改,如下示例: +msprobe工具维护固定的API支持列表,若需要删除或增加dump的API,可以在msprobe/pytorch/hook_module/support_wrap_ops.yaml文件内手动修改,如下示例: ```Python functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_1.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_1.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_2.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_2.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_3.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_3.png diff --git 
a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_4.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/BLOOM-7B_4.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_1.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_1.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_2.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_2.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_3.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_3.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_4.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_4.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_5.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_5.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_6.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_6.png diff --git 
a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_7.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_7.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_8.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/GPT-3_8.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/YOLOV5S_1.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/YOLOV5S_1.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/YOLOV5S_2.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/YOLOV5S_2.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/accuracy_checking_details.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/accuracy_checking_details.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/accuracy_checking_result.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/accuracy_checking_result.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png 
b/debug/accuracy_tools/msprobe/pytorch/doc/img/api_precision_compare_details.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/api_precision_compare_details.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/api_precision_compare_result.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/api_precision_compare_result.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/auto_analyze_log.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/auto_analyze_log.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/compare_result_pkl.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/compare_result_pkl.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/compare_result_pkl_md5.png.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/compare_result_pkl_md5.png.png diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/cpu_info.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/cpu_info.png diff --git 
a/debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png b/debug/accuracy_tools/msprobe/pytorch/doc/img/module_compare.png similarity index 100% rename from debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png rename to debug/accuracy_tools/msprobe/pytorch/doc/img/module_compare.png diff --git "a/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" "b/debug/accuracy_tools/msprobe/pytorch/doc/msprobe\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" similarity index 97% rename from "debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" rename to "debug/accuracy_tools/msprobe/pytorch/doc/msprobe\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" index ed175ff3017..c9db3ae78d7 100644 --- "a/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" +++ "b/debug/accuracy_tools/msprobe/pytorch/doc/msprobe\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" @@ -1,4 +1,4 @@ -# atat精度工具标准性能基线报告 +# msprobe精度工具标准性能基线报告 ## 环境信息 @@ -16,7 +16,7 @@ CANN:8.0.T2 ## 模型信息和性能基线 -大模型在使用atat工具dump数据时,建议先简化模型层数,减少dump数据量。 +大模型在使用msprobe工具dump数据时,建议先简化模型层数,减少dump数据量。 以下场景的性能基线测试数据均为多次测试后取平均值,因此实际运行时性能数据可能会根据环境状态稍有浮动。 diff 
--git a/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md b/debug/accuracy_tools/msprobe/pytorch/doc/parse_tool.md similarity index 98% rename from debug/accuracy_tools/atat/pytorch/doc/parse_tool.md rename to debug/accuracy_tools/msprobe/pytorch/doc/parse_tool.md index 23000912910..81efa10fa3e 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/parse_tool.md @@ -6,10 +6,10 @@ ## 进入parse交互式界面 -安装atat工具后(详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节),可以通过使用命令 **atat -f pytorch parse** 进入交互式界面,如下所示: +安装msprobe工具后(详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节),可以通过使用命令 **msprobe -f pytorch parse** 进入交互式界面,如下所示: ```bash -atat -f pytorch parse +msprobe -f pytorch parse Parse >>> ``` @@ -23,7 +23,7 @@ Parse >>> Ctrl+C可以退出parse交互式界面。不退出parse交互式界面若需要执行非该界面下的内置Shell命令,且命令与parse交互式界面命令冲突时,非该界面命令需要使用run命令,在相关命令前加上run前缀,如下示例: ```bash -atat -f pytorch parse +msprobe -f pytorch parse Parse >>> run vim cli.py Parse >>> vim cli.py ``` diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md b/debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_compare.md similarity index 99% rename from debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md rename to debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_compare.md index e3537594c4f..4bd05c73e21 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_compare.md @@ -44,7 +44,7 @@ compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) 创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。 ```Python -from atat.pytorch import * +from msprobe.pytorch import * compare_distributed('./npu_dump/step0', './gpu_dump/step0', './output') ``` @@ -77,7 +77,7 @@ compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_mat 单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 ```Python -from atat.pytorch import compare +from msprobe.pytorch 
import compare dump_result_param={ "npu_json_path": "./npu_dump/dump.json", "bench_json_path": "./gpu_dump/dump.json", @@ -96,7 +96,7 @@ compare(dump_result_param, output_path="./output", stack_mode=True) 以compare.py为例。 ```Python -from atat.pytorch import compare +from msprobe.pytorch import compare dump_result_param={ "npu_json_path": "./npu_dump/dump.json", "bench_json_path": "./gpu_dump/dump.json", diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md b/debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_overview.md similarity index 81% rename from debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md rename to debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_overview.md index 708d90b3487..01945145487 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_overview.md @@ -4,7 +4,7 @@ 在PyTorch训练网络,对同一模型或API调试过程中,遇到API相关的计算精度问题,定位时费时费力。 -atat的精度比对工具,用来进行PyTorch整网API粒度的数据dump、精度比对和溢出检测,从而定位PyTorch训练场景下的精度问题。 +msprobe的精度比对工具,用来进行PyTorch整网API粒度的数据dump、精度比对和溢出检测,从而定位PyTorch训练场景下的精度问题。 **使用场景** @@ -42,17 +42,17 @@ atat的精度比对工具,用来进行PyTorch整网API粒度的数据dump、 1. 准备CPU或GPU训练工程。 -2. 在环境下安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +2. 在环境下安装msprobe工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 -3. 在训练脚本内添加atat工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 +3. 在训练脚本内添加msprobe工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 4. 执行训练dump数据。 5. 将CPU或GPU训练工程迁移为NPU训练工程。详见《[PyTorch模型迁移调优指南](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html)》。 -6. 在NPU环境下安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +6. 在NPU环境下安装msprobe工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 -7. 在NPU训练脚本内添加atat工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 +7. 在NPU训练脚本内添加msprobe工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 8. 
NPU环境下执行训练dump数据。 diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md b/debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_quickstart.md similarity index 94% rename from debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md rename to debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_quickstart.md index c0530205568..4b6ac9de2f0 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/ptdbg_ascend_quickstart.md @@ -1,8 +1,8 @@ # **精度比对工具** -本文主要介绍atat的精度比对工具的快速入门和场景化示例。 +本文主要介绍msprobe的精度比对工具的快速入门和场景化示例。 -本文介绍的操作需要安装atat工具,详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +本文介绍的操作需要安装msprobe工具,详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 本文介绍的操作主要是精度数据dump和精度比对,详细操作指导可参考《[精度数据采集](./dump.md)》和《[CPU或GPU与NPU精度数据比对](./ptdbg_ascend.md)》。 @@ -51,12 +51,12 @@ PyTorch训练场景的精度问题分析建议参考以下思路进行精度比 } ``` -2. 在训练脚本内添加atat工具,dump整网数据。 +2. 在训练脚本内添加msprobe工具,dump整网数据。 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): ```python - from atat.pytorch import PrecisionDebugger + from msprobe.pytorch import PrecisionDebugger debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") # 请勿将以上初始化流程插入到循环代码中 @@ -82,7 +82,7 @@ PyTorch训练场景的精度问题分析建议参考以下思路进行精度比 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: ```python - from atat.pytorch import compare + from msprobe.pytorch import compare dump_result_param={ "npu_json_path": "./npu_dump/dump.json", "bench_json_path": "./gpu_dump/dump.json", @@ -140,10 +140,10 @@ python3 compare.py } ``` -2. 在NPU训练脚本内添加atat工具,执行溢出检测dump。 +2. 
在NPU训练脚本内添加msprobe工具,执行溢出检测dump。 ```python - from atat.pytorch import PrecisionDebugger + from msprobe.pytorch import PrecisionDebugger debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") # 请勿将以上初始化流程插入到循环代码中 @@ -171,7 +171,7 @@ python3 compare.py 溢出解析工具执行命令如下: ```bash - atat -f pytorch run_overflow_check -api_info ./dump.json + msprobe -f pytorch run_overflow_check -api_info ./dump.json ``` 反向过程溢出的API暂不支持精度预检功能。 @@ -200,7 +200,7 @@ python3 compare.py 1. 创建比对脚本,例如compare_distributed.py,拷贝如下代码。 ```python - from atat.pytorch import * + from msprobe.pytorch import * compare_distributed('./npu_dump/step0', './gpu_dump/step0', './output') ``` @@ -219,7 +219,7 @@ python3 compare.py 多卡一般为多进程,须保证每个进程都正确调用PrecisionDebugger,或把PrecisionDebugger插入到import语句后,如: ```python -from atat.pytorch import PrecisionDebugger +from msprobe.pytorch import PrecisionDebugger debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") ``` @@ -339,10 +339,10 @@ debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump" } ``` -2. 在训练脚本内添加atat工具,dump整网数据。 +2. 在训练脚本内添加msprobe工具,dump整网数据。 ```python - from atat.pytorch import PrecisionDebugger + from msprobe.pytorch import PrecisionDebugger debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") # 请勿将以上初始化流程插入到循环代码中 diff --git a/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md b/debug/accuracy_tools/msprobe/pytorch/doc/run_overflow_check.md similarity index 95% rename from debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md rename to debug/accuracy_tools/msprobe/pytorch/doc/run_overflow_check.md index 1bdc4f354cf..b8c9c3b4c29 100644 --- a/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md +++ b/debug/accuracy_tools/msprobe/pytorch/doc/run_overflow_check.md @@ -13,7 +13,7 @@ 2. 
执行溢出API解析操作。 ```bash - atat -f pytorch run_overflow_check -api_info ./dump.json + msprobe -f pytorch run_overflow_check -api_info ./dump.json ``` | 参数名称 | 说明 | 是否必选 | diff --git "a/debug/accuracy_tools/atat/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" "b/debug/accuracy_tools/msprobe/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" similarity index 95% rename from "debug/accuracy_tools/atat/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" rename to "debug/accuracy_tools/msprobe/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" index b2e373feb6c..05bebaf0a22 100644 --- "a/debug/accuracy_tools/atat/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" +++ "b/debug/accuracy_tools/msprobe/pytorch/doc/\345\234\250\347\272\277\347\262\276\345\272\246\346\257\224\345\257\271.md" @@ -32,8 +32,8 @@ PyTorch NPU在线精度比对是ptdbg_ascend工具实现在PyTorch训练过程 1. 
在NPU训练脚本中添加在线精度比对接口,示例如下: ```python - from atat.pytorch.common.utils import seed_all - from atat.pytorch.online_dispatch import PtdbgDispatch + from msprobe.pytorch.common.utils import seed_all + from msprobe.pytorch.online_dispatch import PtdbgDispatch # 在main函数开始前固定随机数 seed_all() @@ -74,12 +74,12 @@ PyTorch NPU在线精度比对是ptdbg_ascend工具实现在PyTorch训练过程 | process_num | 多进程并发数,默认为0。 | 否 | | debug | debug信息打印,默认为False。 | 否 | ### dump数据存盘说明 -dump数据存盘目录名格式:`atat_tag_rankid_{timestamp}`。 +dump数据存盘目录名格式:`msprobe_tag_rankid_{timestamp}`。 子目录下包含1个比对结果csv文件、cpu和npudump数据目录,npu目录下包含Aten IR在NPU上的输入输出的dump数据,由于CPU的输入是直接使用NPU的输入执行,因此cpu目录下只包含执行输出的dump数据。 ```bash -atat_rank4_20230911170521 +msprobe_rank4_20230911170521 ├── compare_result_rank4_20230911170521.csv ├── cpu │   ├── native_batch_norm_backward_10_output.0.npy diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/__init__.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/__init__.py similarity index 43% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/__init__.py index b9d41330a87..d234898c0df 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/__init__.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/__init__.py @@ -1,6 +1,6 @@ -from atat.core.common.log import logger -from atat.core.common.exceptions import FreeBenchmarkException -from atat.core.common.const import Const +from msprobe.core.common.log import logger +from msprobe.core.common.exceptions import FreeBenchmarkException +from msprobe.core.common.const import Const from .main import FreeBenchmarkCheck from .common.params import UnequalRow diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/__init__.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/common/__init__.py rename to 
debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/constant.py similarity index 89% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/constant.py index 36b7a649158..c5e93be138d 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/constant.py @@ -2,8 +2,8 @@ from typing import Dict import numpy as np import torch -from atat.pytorch.free_benchmark.common.enums import FuzzThreshold -from atat.pytorch.free_benchmark.common.params import BenchmarkThd +from msprobe.pytorch.free_benchmark.common.enums import FuzzThreshold +from msprobe.pytorch.free_benchmark.common.params import BenchmarkThd class CommonField: @@ -52,6 +52,7 @@ class ThresholdConfig: DTYPE_PER_THD = { torch.float16: 1.002, + torch.bfloat16: 1.004, torch.float32: 1.0002, } BENCHMARK_THD_DICT = { @@ -60,6 +61,8 @@ class ThresholdConfig: torch.bfloat16: BenchmarkThd(2**-8, 1.0, 2**-8, 1e-4), } + TENSOR_SPLIT_MAX_CHUNK = 128 + class PreheatConfig: IF_PREHEAT = "if_preheat" diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/counter.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/counter.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/common/counter.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/counter.py index 186b75c71ae..b2f8c81f3a4 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/counter.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/counter.py @@ -1,5 +1,5 @@ from collections import defaultdict -from atat.pytorch.free_benchmark.common.constant import ThresholdConfig +from msprobe.pytorch.free_benchmark.common.constant 
import ThresholdConfig class PreheatCounter: diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/common/enums.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/params.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/params.py similarity index 93% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/common/params.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/params.py index 440348d78c2..bbfc245a635 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/params.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/params.py @@ -2,13 +2,13 @@ from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple import torch -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.enums import ( +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.enums import ( DeviceType, FuzzLevel, PerturbationMode, ) -from atat.pytorch.free_benchmark.common.utils import Tools +from msprobe.pytorch.free_benchmark.common.utils import Tools @dataclass @@ -78,7 +78,7 @@ def data_pre_deal(name, func, args, kwargs): data_params.valid_input_index = index if index == -1: logger.warning_on_rank_0( - f"[atat] Free benchmark: 无标杆工具不支持当前算子的输入类型 {name}." + f"[msprobe] Free benchmark: 无标杆工具不支持当前算子的输入类型 {name}." 
) return data_params diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py similarity index 92% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/common/utils.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py index 24d25967635..631beeb85cb 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py @@ -1,5 +1,5 @@ import torch -from atat.pytorch.free_benchmark.common.enums import DeviceType +from msprobe.pytorch.free_benchmark.common.enums import DeviceType class Tools: @@ -96,3 +96,7 @@ class TorchC: add = torch._C._VariableFunctionsClass.add bitwise_xor = torch._C._VariableFunctionsClass.bitwise_xor clone = torch._C._VariableFunctionsClass.clone + clamp = torch._C._VariableFunctionsClass.clamp + tensor_split = torch._C._VariableFunctionsClass.tensor_split + stack = torch._C._VariableFunctionsClass.stack + reshape = torch._C._VariableFunctionsClass.reshape diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/grad_saver.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/compare/grad_saver.py similarity index 89% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/compare/grad_saver.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/compare/grad_saver.py index 89ef9e4c9b4..6781a1c2fc4 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/grad_saver.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/compare/grad_saver.py @@ -1,10 +1,10 @@ import torch -from atat.core.common.exceptions import FreeBenchmarkException -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import CommonField -from atat.pytorch.free_benchmark.common.params import DataParams, HandlerParams -from atat.pytorch.free_benchmark.perturbed_layers.layer_factory 
import LayerFactory -from atat.pytorch.free_benchmark.result_handlers.handler_factory import ( +from msprobe.core.common.exceptions import FreeBenchmarkException +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import CommonField +from msprobe.pytorch.free_benchmark.common.params import DataParams, HandlerParams +from msprobe.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory +from msprobe.pytorch.free_benchmark.result_handlers.handler_factory import ( FuzzHandlerFactory, ) @@ -41,18 +41,18 @@ class GradSaver: data_processor.update_unequal_rows(handler.get_unequal_rows()) except IndexError: logger.warning_on_rank_0( - f"[atat] Free benchmark: grad index out of range. api:{self.handler_params.api_name}." + f"[msprobe] Free benchmark: grad index out of range. api:{self.handler_params.api_name}." f"index:{new_grad_index}, perturbation grad len {len(self.perturbed_grad_input)}" ) return grad except FreeBenchmarkException as e: logger.warning_on_rank_0( - f"[atat] Free benchmark: grad input check error: {e}" + f"[msprobe] Free benchmark: grad input check error: {e}" ) return grad except Exception as e: logger.warning_on_rank_0( - f"[atat] Free benchmark: grad compare error: {e}" + f"[msprobe] Free benchmark: grad compare error: {e}" ) return grad return grad @@ -77,7 +77,7 @@ class GradSaver: handler.handle(self.data_params) except Exception as e: logger.warning_on_rank_0( - f"[atat] Free benchmark: compare two vjp failed: api:{self.handler_params.api_name}." + f"[msprobe] Free benchmark: compare two vjp failed: api:{self.handler_params.api_name}." 
f"{e}" ) # 在扰动前后输出对比后释放输出的引用 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/single_benchmark.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/compare/single_benchmark.py similarity index 89% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/compare/single_benchmark.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/compare/single_benchmark.py index 85aa68f13b9..59239fcd004 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/single_benchmark.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/compare/single_benchmark.py @@ -1,9 +1,9 @@ import math import torch -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import ThresholdConfig -from atat.pytorch.free_benchmark.common.utils import TorchC +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import ThresholdConfig +from msprobe.pytorch.free_benchmark.common.utils import TorchC class SingleCompare: @@ -28,6 +28,14 @@ class SingleCompare: tensor[inf_or_nan_mask] = 1 return tensor + @staticmethod + def compare_float_seq(actual, golden): + return math.isclose(actual, golden) + + @staticmethod + def compare_other_seq(actual, golden): + return actual == golden + def compare_dict_seq(self, actual, golden): if len(actual) != len(golden): return False @@ -61,7 +69,7 @@ class SingleCompare: actual.dtype, ThresholdConfig.BENCHMARK_THD_DICT.get(torch.float32) ) if self.filter_overflow(golden) > 0: - logger.warning_on_rank_0("[atat] Free Benchmark: inf and nan" + logger.warning_on_rank_0("[msprobe] Free Benchmark: inf and nan" "in golden tensor is not supported.") return True actual = self.replace_inf_or_nan(actual) @@ -76,12 +84,6 @@ class SingleCompare: return False return True - def compare_float_seq(self, actual, golden): - return math.isclose(actual, golden) - - def compare_other_seq(self, actual, golden): - return actual == golden - def 
_cal_compare_metrics(self, actual, golden): diff_value = TorchC.subtract(actual, golden) diff_abs = TorchC.abs(diff_value) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py similarity index 81% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/main.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py index 2ebc0a6db91..971776d1326 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/main.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py @@ -1,19 +1,19 @@ from abc import ABC import torch -from atat.core.common.const import Const -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import CommonField -from atat.pytorch.free_benchmark.common.enums import ( +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import CommonField +from msprobe.pytorch.free_benchmark.common.enums import ( DeviceType, FuzzLevel, HandlerType, PerturbationMode, ) -from atat.pytorch.free_benchmark.common.params import data_pre_deal, make_handler_params -from atat.pytorch.free_benchmark.compare.grad_saver import GradSaver -from atat.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory -from atat.pytorch.free_benchmark.result_handlers.handler_factory import ( +from msprobe.pytorch.free_benchmark.common.params import data_pre_deal, make_handler_params +from msprobe.pytorch.free_benchmark.compare.grad_saver import GradSaver +from msprobe.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory +from msprobe.pytorch.free_benchmark.result_handlers.handler_factory import ( FuzzHandlerFactory, ) @@ -81,7 +81,7 @@ class FreeBenchmarkCheck(ABC): grad_saver = getattr(module, CommonField.GRADSAVER) except AttributeError: logger.warning_on_rank_0( - f"[atat] Free benchmark: get grad saver 
failed. api_name:{name}" + f"[msprobe] Free benchmark: get grad saver failed. api_name:{name}" ) return @@ -97,6 +97,6 @@ class FreeBenchmarkCheck(ABC): ) except Exception as e: logger.warning_on_rank_0( - f"[atat] Free benchmark: grad vjp calculate failed. api_name:{name} error: {e}" + f"[msprobe] Free benchmark: grad vjp calculate failed. api_name:{name} error: {e}" ) return diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/__init__.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/base_layer.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py similarity index 78% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/base_layer.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py index aa572fd8e8d..f64a201d5ef 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/base_layer.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from typing import Any -from atat.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.common.params import DataParams class BaseLayer(ABC): diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py similarity index 62% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/layer_factory.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py index 
0d09438ce04..0ea9107aa84 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/layer_factory.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py @@ -1,15 +1,15 @@ -from atat.pytorch.free_benchmark import FreeBenchmarkException -from atat.pytorch.free_benchmark.common.enums import DeviceType, PerturbationMode -from atat.pytorch.free_benchmark.perturbed_layers.npu.improve_precision import ( +from msprobe.pytorch.free_benchmark import FreeBenchmarkException +from msprobe.pytorch.free_benchmark.common.enums import DeviceType, PerturbationMode +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.improve_precision import ( ImprovePrecisionLayer, ) -from atat.pytorch.free_benchmark.perturbed_layers.npu.add_noise import AddNoiseLayer -from atat.pytorch.free_benchmark.perturbed_layers.npu.bit_noise import BitNoiseLayer -from atat.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer -from atat.pytorch.free_benchmark.perturbed_layers.npu.change_value import ( +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.add_noise import AddNoiseLayer +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.bit_noise import BitNoiseLayer +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.change_value import ( ChangeValueLayer, ) -from atat.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer +from msprobe.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer class LayerFactory: diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/__init__.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/__init__.py diff 
--git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py similarity index 78% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py index af8a93f7d4b..a18ef1c51bd 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py @@ -1,10 +1,10 @@ import torch -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import ThresholdConfig -from atat.pytorch.free_benchmark.common.enums import PerturbationMode -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.common.utils import TorchC -from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import ThresholdConfig +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.common.utils import TorchC +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( NpuBaseLayer, ) @@ -37,7 +37,7 @@ class AddNoiseLayer(NpuBaseLayer): 对输入添加扰动并返回 """ logger.info_on_rank_0( - f"[atat] Free benchmark: Perturbation is " + f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.ADD_NOISE} of {self.api_name}." 
) params.perturbed_value = self.add_noise(params.args[params.valid_input_index]) @@ -60,13 +60,13 @@ class AddNoiseLayer(NpuBaseLayer): """ if not self.perturbed_value: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.api_name}, " + f"[msprobe] Free Benchmark: For {self.api_name}, " f"dtype unsupported. Cancel perturbation." ) return False if tensor_obj.numel() == 0: logger.warning_on_rank_0( - f"[atat] Free benchmark: For {self.api_name}, tensor shape must > 0." + f"[msprobe] Free benchmark: For {self.api_name}, tensor shape must > 0." f" Cancel adding noise." ) return False @@ -77,13 +77,13 @@ class AddNoiseLayer(NpuBaseLayer): max_val = TorchC.max(TorchC.abs(tensor_obj)).item() except Exception: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.api_name}, " + f"[msprobe] Free Benchmark: For {self.api_name}, " f"when calculate maximun value, tensor is changed to float32." ) max_val = TorchC.max(TorchC.abs(tensor_obj.to(torch.float32))).item() if max_val < abs_tol: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.api_name}, " + f"[msprobe] Free Benchmark: For {self.api_name}, " f"Maximun value is less than the minimun threshold. Cancel add noise." 
) return False diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py similarity index 80% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py index 40b99acf411..45dea7b93a5 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py @@ -1,10 +1,10 @@ import torch -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import ThresholdConfig -from atat.pytorch.free_benchmark.common.enums import PerturbationMode -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.common.utils import TorchC -from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import ThresholdConfig +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.common.utils import TorchC +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( NpuBaseLayer, ) @@ -53,7 +53,7 @@ class BitNoiseLayer(NpuBaseLayer): 对输入添加扰动并返回 """ logger.info_on_rank_0( - f"[atat] Free benchmark: Perturbation is " + f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.BIT_NOISE} of {self.api_name}." 
) params.perturbed_value = self.add_bit_noise(params.args[params.valid_input_index]) @@ -65,13 +65,13 @@ class BitNoiseLayer(NpuBaseLayer): """ if not self.bit_type: logger.info_on_rank_0( - f"[atat] Free Benchmark: For {self.api_name}, " + f"[msprobe] Free Benchmark: For {self.api_name}, " f"dtype unsupported. Cancel perturbation." ) return False if tensor_obj.numel() == 0: logger.warning_on_rank_0( - f"[atat] Free benchmark: For {self.api_name}, tensor shape must > 0" + f"[msprobe] Free benchmark: For {self.api_name}, tensor shape must > 0" f" Cancel adding noise." ) return False @@ -82,13 +82,13 @@ class BitNoiseLayer(NpuBaseLayer): max_val = TorchC.max(TorchC.abs(tensor_obj)).item() except Exception: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.api_name}, " + f"[msprobe] Free Benchmark: For {self.api_name}, " f"when calculate maximun value, tensor is changed to float32." ) max_val = TorchC.max(TorchC.abs(tensor_obj.to(torch.float32))).item() if max_val < abs_tol: logger.info_on_rank_0( - f"[atat] Free Benchmark: For {self.api_name}, " + f"[msprobe] Free Benchmark: For {self.api_name}, " f"Maximun value is less than the minimun threshold. Cancel add noise." 
) return False diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/change_value.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py similarity index 81% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/change_value.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py index b7a967e18b9..91085d57a68 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py @@ -1,9 +1,9 @@ import torch -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.enums import PerturbationMode -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.common.utils import TorchC -from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.common.utils import TorchC +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( NpuBaseLayer, ) @@ -44,7 +44,7 @@ class ChangeValueLayer(NpuBaseLayer): 对输入添加扰动并返回 """ logger.info_on_rank_0( - f"[atat] Free benchmark: Perturbation is " + f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.CHANGE_VALUE} of {self.api_name}." ) params.perturbed_value = self.change_value(params.args[params.valid_input_index]) @@ -56,7 +56,7 @@ class ChangeValueLayer(NpuBaseLayer): """ if tensor_obj.size(0) < 2: logger.info_on_rank_0( - f"[atat] Free Benchmark: For {self.api_name}, " + f"[msprobe] Free Benchmark: For {self.api_name}, " f"size 0 must greater than 1. Cancel change value." 
) return False diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py similarity index 83% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py index 03718e3c4d6..ad6d8b8989d 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py @@ -1,10 +1,10 @@ import torch -from atat.core.common.const import Const -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import CommonField -from atat.pytorch.free_benchmark.common.enums import PerturbationMode -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import CommonField +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( NpuBaseLayer, ) @@ -34,7 +34,7 @@ class ImprovePrecisionLayer(NpuBaseLayer): def handle(self, params: DataParams) -> torch.Any: logger.info_on_rank_0( - f"[atat] Free benchmark: Perturbation is " + f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.IMPROVE_PRECISION} of {self.api_name}." 
) new_args = self.improve_tensor_precision(params.args) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py similarity index 64% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py index bb065385c69..a69c56002a2 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py @@ -1,8 +1,8 @@ import torch -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.enums import PerturbationMode -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( NpuBaseLayer, ) @@ -21,7 +21,7 @@ class NoChangeLayer(NpuBaseLayer): 对输入添加扰动并返回 """ logger.info_on_rank_0( - f"[atat] Free benchmark: Perturbation is " + f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.NO_CHANGE} of {self.api_name}." 
) params.perturbed_value = self.no_change(params.args[params.valid_input_index]) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py similarity index 90% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py index 3784af09530..1a859481475 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py @@ -2,8 +2,8 @@ from abc import abstractmethod from typing import Any import torch -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer class NpuBaseLayer(BaseLayer): diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py similarity index 52% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py index 024958ffbe1..d34ac976537 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py @@ -1,9 +1,9 @@ import torch -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.common.utils import Tools -from atat.pytorch.free_benchmark.common.enums import DeviceType -from 
atat.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.common.utils import Tools +from msprobe.pytorch.free_benchmark.common.enums import DeviceType +from msprobe.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer class CpuLayer(BaseLayer): @@ -11,7 +11,7 @@ class CpuLayer(BaseLayer): def handle(self, params: DataParams) -> torch.Any: logger.info_on_rank_0( - f"[atat] Free benchmark: Perturbation is to_cpu of {self.api_name}." + f"[msprobe] Free benchmark: Perturbation is to_cpu of {self.api_name}." ) new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True) new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/__init__.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/base_handler.py similarity index 64% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/base_handler.py index c57d7e390a0..e36f5867355 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/base_handler.py @@ -1,22 +1,23 @@ import math from abc import ABC, abstractmethod from typing import Any, Optional, Tuple +import numpy as np import torch 
-from atat.core.common.const import Const -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import ThresholdConfig -from atat.pytorch.free_benchmark.common.enums import ( +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import ThresholdConfig +from msprobe.pytorch.free_benchmark.common.enums import ( FuzzThreshold, NormType, PerturbationMode, ) -from atat.pytorch.free_benchmark.common.params import ( +from msprobe.pytorch.free_benchmark.common.params import ( DataParams, HandlerParams, make_unequal_row, ) -from atat.pytorch.free_benchmark.common.utils import Tools, TorchC +from msprobe.pytorch.free_benchmark.common.utils import Tools, TorchC class FuzzHandler(ABC): @@ -34,15 +35,36 @@ class FuzzHandler(ABC): origin_ouput = origin_ouput.values perturbed_output = perturbed_output.values if hasattr(perturbed_output, "dtype"): - abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(perturbed_output.dtype) + abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(perturbed_output.dtype, FuzzThreshold.F32_THD) else: - abs_tol = FuzzThreshold.F32_THD.value + abs_tol = FuzzThreshold.F32_THD return ( origin_ouput.to(perturbed_output.dtype).to(perturbed_output.device), perturbed_output, abs_tol, ) + @staticmethod + def tensor_split_for_error_calculate(origin_output, perturbed_output): + """ + 对将投入误差值计算的扰动前后输出张量进行分块 + :param origin_output: 原始输出 + :param perturbed_output: 扰动后输出 + :return origin_output_chunks: 切块后原始输出列表 + :return perturbed_output_chunks: 切块后扰动后输出列表 + """ + single_output_mem = origin_output.element_size() * origin_output.nelement() / Const.ONE_MB + if single_output_mem == 0 or origin_output.ndim == 0: + return [origin_output], [perturbed_output] + # 张量大小和批数之间的关系:chunks_exp=math.log(M,2)-4, chunks=2**chunks_exp (M为对比张量数据大小[Mb]) + chunks_exp = int(math.log(single_output_mem, 2)) - 4 + chunks = 2 ** chunks_exp + chunks = 
max(chunks, 1) + chunks = min(chunks, ThresholdConfig.TENSOR_SPLIT_MAX_CHUNK) + origin_output_chunks = TorchC.tensor_split(TorchC.reshape(origin_output, (-1,)), chunks) + perturbed_output_chunks = TorchC.tensor_split(TorchC.reshape(perturbed_output, (-1,)), chunks) + return origin_output_chunks, perturbed_output_chunks + @staticmethod def convert_overflow_ratio_to_consistent(ratio): if math.isnan(ratio) or math.isinf(ratio): @@ -61,36 +83,28 @@ class FuzzHandler(ABC): self, origin_output, perturbed_output, norm_type, abs_tol ): if norm_type == NormType.ENDLESS_NORM: - return self.get_endless_norm(origin_output, perturbed_output, abs_tol) + return self.calculate_error(origin_output, perturbed_output, abs_tol) return ThresholdConfig.COMP_CONSISTENT - def get_endless_norm(self, origin_output, perturbed_output, abs_tol): - ratio_tensor1 = TorchC.where( - TorchC.gt(TorchC.abs(perturbed_output), abs_tol), - TorchC.div( - TorchC.abs(origin_output), - TorchC.add(TorchC.abs(perturbed_output), abs_tol), - ), - 1, - ) - ratio_tensor2 = TorchC.where( - TorchC.gt(TorchC.abs(origin_output), abs_tol), - TorchC.div( - TorchC.abs(perturbed_output), - TorchC.add(TorchC.abs(origin_output), abs_tol), - ), - 1, - ) + def calculate_error(self, origin_output, perturbed_output, abs_tol): + origin_output_chunks, perturbed_output_chunks = self.tensor_split_for_error_calculate(origin_output, perturbed_output) + norm1 = -np.inf + norm2 = -np.inf + norm3 = np.inf + for i, chunk_origin in enumerate(origin_output_chunks): + if chunk_origin.nelement() == 0: + break + chunk_perturbed = perturbed_output_chunks[i] + ratio_tensor1 = TorchC.where(TorchC.abs(chunk_perturbed) > abs_tol, + TorchC.div(TorchC.clamp(chunk_origin, min=abs_tol), TorchC.clamp(chunk_perturbed, min=abs_tol)), 1) + ratio_tensor2 = TorchC.where(TorchC.abs(chunk_origin) > abs_tol, + TorchC.div(TorchC.clamp(chunk_perturbed, min=abs_tol), TorchC.clamp(chunk_origin, min=abs_tol)), 1) + norm_values = 
TorchC.stack([TorchC.max(ratio_tensor1), TorchC.max(ratio_tensor2)]) + max_ratio1, max_ratio2 = norm_values.tolist() + norm1 = max(norm1, self.convert_overflow_ratio_to_consistent(max_ratio1)) + norm2 = max(norm2, self.convert_overflow_ratio_to_consistent(max_ratio2)) + norm3 = min(norm3, self.convert_overflow_ratio_to_consistent(max_ratio1)) - norm1 = self.convert_overflow_ratio_to_consistent( - TorchC.max(ratio_tensor1).item() - ) - norm2 = self.convert_overflow_ratio_to_consistent( - TorchC.max(ratio_tensor2).item() - ) - norm3 = self.convert_overflow_ratio_to_consistent( - TorchC.min(ratio_tensor1).item() - ) if norm3 < 0: ratio = ThresholdConfig.SYMBOL_FLIPPING else: @@ -104,7 +118,7 @@ class FuzzHandler(ABC): ) except Exception as e: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.params.api_name}, " + f"[msprobe] Free Benchmark: For {self.params.api_name}, " f"when computing ratio," f" y1 or y2 dtype is not supported {e}" ) @@ -133,7 +147,7 @@ class FuzzHandler(ABC): ) elif not isinstance(perturbed_output, torch.Tensor): logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.params.api_name} " + f"[msprobe] Free Benchmark: For {self.params.api_name} " f"The compare for output type {type(perturbed_output)} is not supported" ) @@ -185,7 +199,7 @@ class FuzzHandler(ABC): ) except Exception as e: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.params.api_name}, " + f"[msprobe] Free Benchmark: For {self.params.api_name}, " f"when campare the result exception raise {e}" ) return npu_consistent, max_fuzz_ratio diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/check_handler.py similarity index 68% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/check_handler.py index ed846803a18..c16284eb07b 
100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/check_handler.py @@ -1,11 +1,11 @@ from typing import Any -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.enums import DeviceType -from atat.pytorch.free_benchmark.common.params import DataParams, make_unequal_row -from atat.pytorch.free_benchmark.common.utils import Tools -from atat.pytorch.free_benchmark.compare.single_benchmark import SingleCompare -from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.enums import DeviceType +from msprobe.pytorch.free_benchmark.common.params import DataParams, make_unequal_row +from msprobe.pytorch.free_benchmark.common.utils import Tools +from msprobe.pytorch.free_benchmark.compare.single_benchmark import SingleCompare +from msprobe.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler class CheckerHandler(FuzzHandler): @@ -33,7 +33,7 @@ class CheckerHandler(FuzzHandler): self.other_compare(data_params) except Exception as e: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.params.api_name}, " + f"[msprobe] Free Benchmark: For {self.params.api_name}, " f"when campare the result exception raise {e}" ) return data_params.original_result diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py similarity index 60% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py index fa5c6f37495..a1d90035e84 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py +++ 
b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py @@ -1,9 +1,9 @@ from typing import Any -from atat.pytorch.free_benchmark.common.params import DataParams -from atat.pytorch.free_benchmark.common.utils import Tools -from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler -from atat.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.common.utils import Tools +from msprobe.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler +from msprobe.pytorch.free_benchmark import logger class FixHandler(FuzzHandler): @@ -18,7 +18,7 @@ class FixHandler(FuzzHandler): ) except Exception as e: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.params.api_name} " + f"[msprobe] Free Benchmark: For {self.params.api_name} " f"Fix output failed. " ) return data_params.original_result \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py similarity index 59% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py index cff629854d9..5ee968c6a86 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py @@ -1,10 +1,10 @@ -from atat.pytorch.free_benchmark import FreeBenchmarkException -from atat.pytorch.free_benchmark.common.constant import PreheatConfig -from atat.pytorch.free_benchmark.common.enums import HandlerType -from atat.pytorch.free_benchmark.common.params import HandlerParams -from atat.pytorch.free_benchmark.result_handlers.check_handler import CheckerHandler -from 
atat.pytorch.free_benchmark.result_handlers.preheat_handler import PreheatHandler -from atat.pytorch.free_benchmark.result_handlers.fix_handler import FixHandler +from msprobe.pytorch.free_benchmark import FreeBenchmarkException +from msprobe.pytorch.free_benchmark.common.constant import PreheatConfig +from msprobe.pytorch.free_benchmark.common.enums import HandlerType +from msprobe.pytorch.free_benchmark.common.params import HandlerParams +from msprobe.pytorch.free_benchmark.result_handlers.check_handler import CheckerHandler +from msprobe.pytorch.free_benchmark.result_handlers.preheat_handler import PreheatHandler +from msprobe.pytorch.free_benchmark.result_handlers.fix_handler import FixHandler class FuzzHandlerFactory: diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py similarity index 88% rename from debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py rename to debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py index 033a6d4931f..d78e4303620 100644 --- a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py @@ -1,14 +1,14 @@ import math from typing import Any -from atat.pytorch.free_benchmark import logger -from atat.pytorch.free_benchmark.common.constant import ThresholdConfig -from atat.pytorch.free_benchmark.common.counter import preheat_counter -from atat.pytorch.free_benchmark.common.enums import DeviceType -from atat.pytorch.free_benchmark.common.params import DataParams, HandlerParams -from atat.pytorch.free_benchmark.common.utils import Tools -from atat.pytorch.free_benchmark.compare.single_benchmark import SingleCompare -from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler +from 
msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import ThresholdConfig +from msprobe.pytorch.free_benchmark.common.counter import preheat_counter +from msprobe.pytorch.free_benchmark.common.enums import DeviceType +from msprobe.pytorch.free_benchmark.common.params import DataParams, HandlerParams +from msprobe.pytorch.free_benchmark.common.utils import Tools +from msprobe.pytorch.free_benchmark.compare.single_benchmark import SingleCompare +from msprobe.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler class PreheatHandler(FuzzHandler): @@ -74,14 +74,14 @@ class PreheatHandler(FuzzHandler): cpu_consistent = self.compare_npu_and_cpu(data_params) except Exception as e: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.params.api_name}, " + f"[msprobe] Free Benchmark: For {self.params.api_name}, " f"when campare to cpu exception raise {e}" ) try: first_dtype = Tools.get_first_tensor_dtype(data_params.original_result) except RuntimeError: logger.warning_on_rank_0( - f"[atat] Free Benchmark: For {self.params.api_name}, " + f"[msprobe] Free Benchmark: For {self.params.api_name}, " f"the output sequence does not contain tensors." 
) if preheat_counter.get_api_preheat(self.pure_name, str(first_dtype)): @@ -96,7 +96,7 @@ class PreheatHandler(FuzzHandler): if res: total_count = preheat_counter.get_one_step_used_api(self.pure_name) logger.info_on_rank_0( - f"[atat] Free benchmark: preheat sample in step{self.params.step}" + f"[msprobe] Free benchmark: preheat sample in step{self.params.step}" f"api_name {self.params.api_name}, " f"curr_called_seq: {curr_called_seq}/{total_count}" ) diff --git a/debug/accuracy_tools/msprobe/pytorch/function_factory.py b/debug/accuracy_tools/msprobe/pytorch/function_factory.py new file mode 100644 index 00000000000..c2fd8bfd0cb --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/function_factory.py @@ -0,0 +1,75 @@ +from msprobe.pytorch.common.utils import logger +from msprobe.pytorch.bench_functions.apply_adam_w import npu_apply_adam_w +from msprobe.pytorch.bench_functions.confusion_transpose import npu_confusion_transpose, \ + npu_confusion_transpose_backward +from msprobe.pytorch.bench_functions.fast_gelu import fast_gelu, npu_fast_gelu_backward +from msprobe.pytorch.bench_functions.layer_norm_eval import npu_layer_norm_eval +from msprobe.pytorch.bench_functions.linear import npu_linear, npu_linear_backward +from msprobe.pytorch.bench_functions.matmul_backward import matmul_backward +from msprobe.pytorch.bench_functions.npu_fusion_attention import npu_fusion_attention, npu_fusion_attention_grad +from msprobe.pytorch.bench_functions.rms_norm import npu_rms_norm, npu_rms_norm_backward +from msprobe.pytorch.bench_functions.rotary_mul import npu_rotary_mul, npu_rotary_mul_backward +from msprobe.pytorch.bench_functions.scaled_mask_softmax import npu_scaled_masked_softmax, \ + npu_scaled_masked_softmax_backward +from msprobe.pytorch.bench_functions.swiglu import npu_swiglu, npu_swiglu_backward, swish_grad, swish + + +class Register(dict): + def __init__(self, *args, **kwargs): + super(Register, self).__init__(*args, **kwargs) + self._dict = {} + + def 
__call__(self, target_func_list): + for target in target_func_list: + self.register(target) + return + + def __setitem__(self, key, value): + self._dict[key] = value + + def __getitem__(self, key): + return self._dict[key] + + def __contains__(self, key): + return key in self._dict + + def __str__(self): + return str(self._dict) + + def keys(self): + return self._dict.keys() + + def values(self): + return self._dict.values() + + def items(self): + return self._dict.items() + + def register(self, target): + + def add_register_item(key, value): + if key in self._dict: + logger.warning(f"{value.__name__} has been registered before, so we will overriden it.") + self[key] = value + return value + + if callable(target): + return add_register_item(target.__name__, target) + else: + raise Exception(f"The func {target} is not callable.") + + +# register for npu custom bench functions +npu_custom_functions = Register() +npu_custom_functions([ + npu_apply_adam_w, npu_confusion_transpose, fast_gelu, npu_layer_norm_eval, npu_linear, npu_fusion_attention, + npu_rms_norm, npu_rotary_mul, npu_scaled_masked_softmax, npu_swiglu +]) + +# register for npu custom backward bench functions +npu_custom_grad_functions = Register() +npu_custom_grad_functions([ + npu_confusion_transpose_backward, npu_fast_gelu_backward, npu_linear_backward, matmul_backward, + npu_fusion_attention_grad, npu_rms_norm_backward, npu_rotary_mul_backward, npu_scaled_masked_softmax_backward, + npu_swiglu_backward +]) diff --git a/debug/accuracy_tools/atat/pytorch/functional/__init__.py b/debug/accuracy_tools/msprobe/pytorch/functional/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/functional/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/functional/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/functional/data_processor.py b/debug/accuracy_tools/msprobe/pytorch/functional/data_processor.py similarity index 100% rename from 
debug/accuracy_tools/atat/pytorch/functional/data_processor.py rename to debug/accuracy_tools/msprobe/pytorch/functional/data_processor.py diff --git a/debug/accuracy_tools/atat/pytorch/functional/dump_module.py b/debug/accuracy_tools/msprobe/pytorch/functional/dump_module.py similarity index 73% rename from debug/accuracy_tools/atat/pytorch/functional/dump_module.py rename to debug/accuracy_tools/msprobe/pytorch/functional/dump_module.py index 675fa2a1bfd..efb95c3369f 100644 --- a/debug/accuracy_tools/atat/pytorch/functional/dump_module.py +++ b/debug/accuracy_tools/msprobe/pytorch/functional/dump_module.py @@ -1,10 +1,10 @@ import torch.nn as nn -from atat.pytorch.common.log import logger -from atat.core.common.const import Const -from atat.pytorch.hook_module.api_registry import api_register -from atat.pytorch.debugger.precision_debugger import PrecisionDebugger -from atat.core.common.exceptions import MsaccException -from atat.core.data_dump.scope import BaseScope +from msprobe.pytorch.common.log import logger +from msprobe.core.common.const import Const +from msprobe.pytorch.hook_module.api_registry import api_register +from msprobe.pytorch.debugger.precision_debugger import PrecisionDebugger +from msprobe.core.common.exceptions import MsprobeException +from msprobe.core.data_dump.scope import BaseScope module_count = {} @@ -12,10 +12,10 @@ module_count = {} def module_dump(module, dump_name): if not isinstance(module, nn.Module): logger.error("The parameter:module in module_dump is not a Module subclass.") - raise MsaccException(MsaccException.INVALID_PARAM_ERROR) + raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR) if not isinstance(dump_name, str): logger.error("The parameter:dump_name in module_dump is not a str type.") - raise MsaccException(MsaccException.INVALID_PARAM_ERROR) + raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR) api_register.api_originality() if dump_name not in module_count: module_count[dump_name] = 0 diff --git 
a/debug/accuracy_tools/atat/pytorch/hook_module/__init__.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/hook_module/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/api_registry.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/api_registry.py similarity index 91% rename from debug/accuracy_tools/atat/pytorch/hook_module/api_registry.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/api_registry.py index 3b971cc71ec..f75201eafcd 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/api_registry.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/api_registry.py @@ -18,15 +18,15 @@ import torch import torch.distributed as dist -from atat.pytorch.hook_module import wrap_torch, wrap_functional, wrap_tensor, wrap_vf, wrap_distributed, wrap_aten -from atat.pytorch.hook_module.wrap_aten import get_aten_ops -from atat.pytorch.hook_module.wrap_distributed import get_distributed_ops -from atat.pytorch.hook_module.wrap_functional import get_functional_ops -from atat.pytorch.hook_module.wrap_tensor import get_tensor_ops -from atat.pytorch.hook_module.wrap_torch import get_torch_ops -from atat.pytorch.hook_module.wrap_vf import get_vf_ops -from atat.pytorch.common.utils import torch_without_guard_version, npu_distributed_api, is_gpu -from atat.core.common.const import Const +from msprobe.pytorch.hook_module import wrap_torch, wrap_functional, wrap_tensor, wrap_vf, wrap_distributed, wrap_aten +from msprobe.pytorch.hook_module.wrap_aten import get_aten_ops +from msprobe.pytorch.hook_module.wrap_distributed import get_distributed_ops +from msprobe.pytorch.hook_module.wrap_functional import get_functional_ops +from msprobe.pytorch.hook_module.wrap_tensor import get_tensor_ops +from msprobe.pytorch.hook_module.wrap_torch import get_torch_ops +from 
msprobe.pytorch.hook_module.wrap_vf import get_vf_ops +from msprobe.pytorch.common.utils import torch_without_guard_version, npu_distributed_api, is_gpu +from msprobe.core.common.const import Const torch_version_above_2 = torch.__version__.split('+')[0] > '2.0' diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/hook_module.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/hook_module.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/hook_module/hook_module.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/hook_module.py index 57212b6e45c..ff6427e51e5 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/hook_module.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/hook_module.py @@ -17,10 +17,12 @@ import functools import threading + import torch import torch.nn as nn import torch.utils.hooks as full_hooks -from atat.core.common.const import Const + +from msprobe.core.common.const import Const class HOOKModule(nn.Module): @@ -61,6 +63,10 @@ class HOOKModule(nn.Module): HOOKModule.inner_stop_hook[self.current_thread] = False return result + @classmethod + def reset_module_stats(cls): + cls.module_count = {} + def _call_func(self, *input, **kwargs): full_backward_hooks, non_full_backward_hooks = [], [] if len(self._backward_hooks) > 0: diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml similarity index 99% rename from debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml rename to debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml index d64c577ff38..f68708e945e 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml @@ -1873,4 +1873,5 @@ distributed: - reduce_scatter - _reduce_scatter_base - _all_gather_base - - all_to_all_single \ No newline at end of file + - 
all_to_all_single + - all_to_all \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/utils.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/utils.py similarity index 95% rename from debug/accuracy_tools/atat/pytorch/hook_module/utils.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/utils.py index e4ed157af6d..c1e581675fa 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/utils.py @@ -18,7 +18,7 @@ import os import yaml -from atat.core.common.file_check import FileOpen +from msprobe.core.common.file_check import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_aten.py similarity index 71% rename from debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_aten.py index c5a3c6365d1..a02abbe5f4b 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_aten.py @@ -20,16 +20,18 @@ import torch import yaml -from atat.pytorch.hook_module.hook_module import HOOKModule -from atat.pytorch.common.utils import torch_device_guard -from atat.core.common.const import Const -from atat.core.common.file_check import FileOpen - +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.common.utils import torch_device_guard +from msprobe.core.common.const import Const +from msprobe.core.common.file_check import FileOpen +from msprobe.pytorch.function_factory import npu_custom_grad_functions cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") with FileOpen(yaml_path, 'r') as f: - WrapAtenOps = yaml.safe_load(f).get('aten') + Ops = 
yaml.safe_load(f) + WrapAtenOps = Ops.get('aten') + WhiteAtenOps = Ops.get('white_aten_ops', []) aten_func = {} @@ -48,7 +50,7 @@ class HOOKAtenOP(object): class AtenOPTemplate(HOOKModule): - def __init__(self, op, hook): + def __init__(self, op, hook, need_hook=True): if isinstance(op, torch._ops.OpOverloadPacket): op_name_ = op._qualified_op_name.split("::")[-1] else: @@ -58,10 +60,21 @@ class AtenOPTemplate(HOOKModule): op_name_ = op_name_ + '.' + overload_name self.op = op self.prefix_op_name_ = "Aten" + Const.SEP + str(op_name_) + Const.SEP - super().__init__(hook) + self.need_hook = need_hook + if self.need_hook: + super().__init__(hook) @torch_device_guard def forward(self, *args, **kwargs): + if isinstance(self.op, str): + if self.op in npu_custom_grad_functions: + return npu_custom_grad_functions[self.op](*args, **kwargs) + if self.op in WhiteAtenOps: + return eval(f"torch.ops.aten.{self.op}")(*args, **kwargs) + if self.op not in aten_func: + raise Exception(f"Skip op[{self.op}] accuracy check, because the op is not " + f"in dir(torch.ops.aten) and support yaml.") + return aten_func[self.op](*args, **kwargs) return self.op(*args, **kwargs) @@ -80,13 +93,13 @@ class AtenOPPacketTemplate(): else: return attr - def overloads(self): - return self.opPacket.overloads() - @torch_device_guard def __call__(self, *args, **kwargs): return AtenOPTemplate(self.opPacket, self.hook)(*args, **kwargs) + def overloads(self): + return self.opPacket.overloads() + def wrap_aten_op(op, hook): return AtenOPPacketTemplate(op, hook) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_distributed.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_distributed.py similarity index 91% rename from debug/accuracy_tools/atat/pytorch/hook_module/wrap_distributed.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_distributed.py index e02189ac1bf..6cf425441cc 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_distributed.py +++ 
b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_distributed.py @@ -20,10 +20,10 @@ from functools import wraps import torch.distributed as dist import yaml -from atat.pytorch.hook_module.hook_module import HOOKModule -from atat.pytorch.common.utils import torch_device_guard -from atat.core.common.const import Const -from atat.core.common.file_check import FileOpen +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.common.utils import torch_device_guard +from msprobe.core.common.const import Const +from msprobe.core.common.file_check import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_functional.py similarity index 94% rename from debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_functional.py index fa97f5ee310..fd7610ca8fc 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_functional.py @@ -20,11 +20,11 @@ import os import torch import yaml -from atat.pytorch.hook_module.hook_module import HOOKModule -from atat.pytorch.common.utils import torch_device_guard -from atat.core.common.const import Const -from atat.pytorch.common.log import logger -from atat.core.common.file_check import FileOpen +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.common.utils import torch_device_guard +from msprobe.core.common.const import Const +from msprobe.pytorch.common.log import logger +from msprobe.core.common.file_check import FileOpen def remove_dropout(): diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_npu_custom.py similarity index 71% rename from 
debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_npu_custom.py index 7d0882804f4..8a67ed94290 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_npu_custom.py @@ -17,19 +17,26 @@ import os import torch -import torch_npu import yaml -from atat.pytorch.hook_module.hook_module import HOOKModule -from atat.pytorch.common.utils import torch_device_guard, torch_without_guard_version -from atat.core.common.const import Const -from atat.core.common.file_check import FileOpen +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.common.utils import torch_device_guard, torch_without_guard_version +from msprobe.core.common.const import Const +from msprobe.core.common.file_check import FileOpen +from msprobe.pytorch.function_factory import npu_custom_functions cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") with FileOpen(yaml_path, 'r') as f: WrapNpuOps = yaml.safe_load(f).get('torch_npu') +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + def get_npu_ops(): global WrapNpuOps @@ -46,13 +53,19 @@ class HOOKNpuOP(object): class NpuOPTemplate(HOOKModule): - def __init__(self, op_name, hook): + def __init__(self, op_name, hook, need_hook=True): self.op_name_ = op_name self.prefix_op_name_ = "NPU" + Const.SEP + str(op_name) + Const.SEP - super().__init__(hook) + self.need_hook = need_hook + if need_hook: + super().__init__(hook) @torch_device_guard def forward(self, *args, **kwargs): + if not self.need_hook: + if self.op_name_ not in npu_custom_functions: + raise Exception(f'There is not bench function {self.op_name_}') + return npu_custom_functions[self.op_name_](*args, **kwargs) if torch_without_guard_version: return getattr(torch.ops.npu, str(self.op_name_))(*args, **kwargs) else: 
@@ -60,7 +73,6 @@ class NpuOPTemplate(HOOKModule): def wrap_npu_op(op_name, hook): - def npu_op_template(*args, **kwargs): return NpuOPTemplate(op_name, hook)(*args, **kwargs) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_tensor.py similarity index 89% rename from debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_tensor.py index 6fac1814023..3e26ae3beda 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_tensor.py @@ -20,10 +20,10 @@ import os import torch import yaml -from atat.pytorch.hook_module.hook_module import HOOKModule -from atat.pytorch.common.utils import torch_device_guard, parameter_adapter -from atat.core.common.const import Const -from atat.core.common.file_check import FileOpen +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.common.utils import torch_device_guard, parameter_adapter +from msprobe.core.common.const import Const +from msprobe.core.common.file_check import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_torch.py similarity index 91% rename from debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_torch.py index f0bd01fe462..486ddda4919 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_torch.py @@ -20,10 +20,10 @@ import os import torch import yaml -from atat.pytorch.hook_module.hook_module import HOOKModule -from atat.pytorch.common.utils import torch_device_guard -from atat.core.common.const import Const -from 
atat.core.common.file_check import FileOpen +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.common.utils import torch_device_guard +from msprobe.core.common.const import Const +from msprobe.core.common.file_check import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_vf.py similarity index 88% rename from debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py rename to debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_vf.py index d4c570221d4..d78beb2a6ad 100644 --- a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/wrap_vf.py @@ -20,10 +20,10 @@ import os import torch import yaml -from atat.pytorch.hook_module.hook_module import HOOKModule -from atat.core.common.file_check import FileOpen -from atat.pytorch.common.utils import torch_device_guard -from atat.core.common.const import Const +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.core.common.file_check import FileOpen +from msprobe.pytorch.common.utils import torch_device_guard +from msprobe.core.common.const import Const cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") diff --git a/debug/accuracy_tools/atat/pytorch/module_processer.py b/debug/accuracy_tools/msprobe/pytorch/module_processer.py similarity index 72% rename from debug/accuracy_tools/atat/pytorch/module_processer.py rename to debug/accuracy_tools/msprobe/pytorch/module_processer.py index 8ce9140e32c..3e9969d32d9 100644 --- a/debug/accuracy_tools/atat/pytorch/module_processer.py +++ b/debug/accuracy_tools/msprobe/pytorch/module_processer.py @@ -1,15 +1,17 @@ from functools import wraps + import torch from torch.utils.hooks import BackwardHook -from 
atat.core.common.const import Const -from atat.core.data_dump.scope import ModuleRangeScope + +from msprobe.core.common.const import Const +from msprobe.core.data_dump.scope import ModuleRangeScope class ModuleProcesser: + module_count = {} module_stack = [] api_parent_node = "" module_node = {} - current_module_name = "" def __init__(self, scope): if isinstance(scope, ModuleRangeScope): @@ -19,15 +21,22 @@ class ModuleProcesser: BackwardHook.setup_input_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_input_hook) BackwardHook.setup_output_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_output_hook) BackwardHook.setup_output_hook = ModuleProcesser.filter_tensor_and_tuple(BackwardHook.setup_output_hook) - self.module_count = {} @staticmethod def filter_tensor_and_tuple(func): @wraps(func) def wrap_by_filter_tensor_and_tuple(*args, **kwargs): - # setup_output_hook传入非tensor数据,工具后续dump会报错,处理方式是非tensor数据不传入 + # setup_output_hook传入非tensor数据,工具后续dump会报错,处理方式是解析非tensor数据的属性,对tensor属性挂hook # setup_output_hook定义为setup_output_hook(self, args),因此处理第二个位置参数,即*args[1] if not isinstance(args[1], (torch.Tensor, tuple)): + for item_str in dir(args[1]): + item = getattr(args[1], item_str) + # 处理tensor或者只包含tensor的元组 + if isinstance(item, torch.Tensor) or \ + (isinstance(item, tuple) and all(isinstance(x, torch.Tensor) for x in item)): + args_new = (args[0], item) + result = func(*args_new, **kwargs) + setattr(args[1], item_str, result) return args[1] return func(*args, **kwargs) @@ -55,11 +64,26 @@ class ModuleProcesser: else: return result + @staticmethod + def module_count_func(module_name): + if module_name not in ModuleProcesser.module_count: + ModuleProcesser.module_count[module_name] = 0 + else: + ModuleProcesser.module_count[module_name] += 1 + return ModuleProcesser.module_count[module_name] + + @classmethod + def reset_module_stats(cls): + cls.module_count = {} + cls.module_stack = [] + cls.api_parent_node = "" + cls.module_node = {} + def 
node_hook(self, name_prefix, start_or_stop, **kwargs): def pre_hook(module, input, output=None): try: - index = self.module_count_func(name_prefix) + index = ModuleProcesser.module_count_func(name_prefix) except IndexError as e: index = None pass @@ -89,10 +113,3 @@ class ModuleProcesser: return pre_hook else: return end_hook - - def module_count_func(self, module_name): - if module_name not in self.module_count: - self.module_count[module_name] = 0 - else: - self.module_count[module_name] += 1 - return self.module_count[module_name] diff --git a/debug/accuracy_tools/atat/pytorch/online_dispatch/__init__.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/online_dispatch/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/online_dispatch/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/online_dispatch/compare.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/compare.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/online_dispatch/compare.py rename to debug/accuracy_tools/msprobe/pytorch/online_dispatch/compare.py index e6d55ca0614..048ab3f901c 100644 --- a/debug/accuracy_tools/atat/pytorch/online_dispatch/compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/compare.py @@ -8,10 +8,10 @@ from rich.table import Table from rich.console import Console from .single_compare import single_benchmark_compare_wrap from .utils import DispatchException -from atat.core.common.const import CompareConst -from atat.core.common.file_check import FileOpen -from atat.pytorch.common.log import logger -from atat.core.common.utils import CompareException +from msprobe.core.common.const import CompareConst +from msprobe.core.common.file_check import FileOpen +from msprobe.pytorch.common.log import logger +from msprobe.core.common.utils import CompareException ELEMENT_NUM_THRESHOLD = 100 ZERO_NUM_THRESHOLD = 0.1 @@ -228,7 +228,7 @@ class 
Comparator: else: is_bwd_success, bwd_compare_alg_results = True, None if is_bwd_success and bwd_compare_alg_results is None: - self.saver.record_results(ResultInfo(api_name, is_fwd_success, CompareConst.NA, fwd_compare_alg_results, + self.saver.record_results(ResultInfo(api_name, is_fwd_success, CompareConst.NAN, fwd_compare_alg_results, bwd_compare_alg_results)) else: self.saver.record_results(ResultInfo(api_name, is_fwd_success, is_bwd_success, fwd_compare_alg_results, diff --git a/debug/accuracy_tools/atat/pytorch/online_dispatch/dispatch.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/online_dispatch/dispatch.py rename to debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py index 7502d746acf..898df30b99d 100644 --- a/debug/accuracy_tools/atat/pytorch/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py @@ -21,9 +21,9 @@ from .dump_compare import dispatch_workflow, dispatch_multiprocess, error_call, from .utils import get_callstack, data_to_cpu, logger_debug, logger_error, logger_warn, logger_logo, get_sys_info, \ DispatchException from .compare import Comparator -from atat.core.common.file_check import FileOpen -from atat.core.common.utils import check_file_or_directory_path, check_path_before_create -from atat.core.common.const import Const, CompareConst +from msprobe.core.common.file_check import FileOpen +from msprobe.core.common.utils import check_file_or_directory_path, check_path_before_create +from msprobe.core.common.const import Const, CompareConst current_time = time.strftime("%Y%m%d%H%M%S") RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv" @@ -209,9 +209,9 @@ class PtdbgDispatch(TorchDispatchMode): time_now = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())) if tag is None or not isinstance(tag, str): logger_warn('There is not tag or the type of tag is not string.') - 
dir_name = f'atat_rank{self.device_id}_{time_now}' + dir_name = f'msprobe_rank{self.device_id}_{time_now}' else: - dir_name = f'atat_{tag}_rank{self.device_id}_{time_now}' + dir_name = f'msprobe_{tag}_rank{self.device_id}_{time_now}' return dir_name def load_yaml_file(self, file_path): diff --git a/debug/accuracy_tools/atat/pytorch/online_dispatch/dump_compare.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/online_dispatch/dump_compare.py rename to debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py index cd7c5a3f282..f83b6fc9f00 100644 --- a/debug/accuracy_tools/atat/pytorch/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py @@ -7,9 +7,9 @@ import pandas as pd import torch from .utils import np_save_data, logger_debug, logger_error, logger_warn, logger_user, COLOR_RED, COLOR_GREEN, \ COLOR_RESET, CSV_COLUMN_NAME -from atat.core.common.file_check import FileOpen, change_mode -from atat.core.common.const import CompareConst, FileCheckConst, Const -from atat.pytorch.common.log import logger +from msprobe.core.common.file_check import FileOpen, change_mode +from msprobe.core.common.const import CompareConst, FileCheckConst, Const +from msprobe.pytorch.common.log import logger class DispatchRunParam: def __init__(self, debug_flag, device_id, root_npu_path, root_cpu_path, process_num, comparator): diff --git a/debug/accuracy_tools/atat/pytorch/online_dispatch/single_compare.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/single_compare.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/online_dispatch/single_compare.py rename to debug/accuracy_tools/msprobe/pytorch/online_dispatch/single_compare.py diff --git a/debug/accuracy_tools/atat/pytorch/online_dispatch/torch_ops_config.yaml b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml similarity index 
100% rename from debug/accuracy_tools/atat/pytorch/online_dispatch/torch_ops_config.yaml rename to debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml diff --git a/debug/accuracy_tools/atat/pytorch/online_dispatch/utils.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/utils.py similarity index 97% rename from debug/accuracy_tools/atat/pytorch/online_dispatch/utils.py rename to debug/accuracy_tools/msprobe/pytorch/online_dispatch/utils.py index f3fcffb6f26..fec3e0b0074 100644 --- a/debug/accuracy_tools/atat/pytorch/online_dispatch/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/utils.py @@ -12,8 +12,8 @@ except ImportError: else: pta_cpu_device = torch.device("cpu") -from atat.core.common.const import CompareConst, FileCheckConst -from atat.core.common.file_check import change_mode +from msprobe.core.common.const import CompareConst, FileCheckConst +from msprobe.core.common.file_check import change_mode cpu_device = torch._C.device("cpu") COLOR_RED = '\033[31m' diff --git a/debug/accuracy_tools/atat/pytorch/parse.py b/debug/accuracy_tools/msprobe/pytorch/parse.py similarity index 50% rename from debug/accuracy_tools/atat/pytorch/parse.py rename to debug/accuracy_tools/msprobe/pytorch/parse.py index 40792d0e029..efd3d4a2ddb 100644 --- a/debug/accuracy_tools/atat/pytorch/parse.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse.py @@ -1,4 +1,4 @@ -from atat.pytorch.parse_tool import cli +from msprobe.pytorch.parse_tool import cli if __name__ == '__main__': cli.parse() diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/__init__.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/parse_tool/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/cli.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/cli.py similarity index 89% rename from 
debug/accuracy_tools/atat/pytorch/parse_tool/cli.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/cli.py index f59fbf13a8d..500e8eef684 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/cli.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/cli.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -from atat.pytorch.parse_tool.lib.interactive_cli import InteractiveCli -from atat.pytorch.common.log import logger +from msprobe.pytorch.parse_tool.lib.interactive_cli import InteractiveCli +from msprobe.pytorch.common.log import logger def _run_interactive_cli(cli=None): diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/__init__.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/__init__.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/__init__.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/__init__.py diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/compare.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/compare.py similarity index 92% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/compare.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/compare.py index dfc4529414c..2b091c59e8c 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/compare.py @@ -19,9 +19,9 @@ import os import time import numpy as np from collections import namedtuple -from atat.pytorch.parse_tool.lib.utils import Util -from atat.pytorch.parse_tool.lib.config import Const -from atat.pytorch.parse_tool.lib.parse_exception import ParseException +from msprobe.pytorch.parse_tool.lib.utils import Util +from msprobe.pytorch.parse_tool.lib.config import Const +from msprobe.pytorch.parse_tool.lib.parse_exception import ParseException class Compare: @@ -83,16 +83,17 @@ class Compare: (left, right, save_txt, rl, al, 
diff_count) = args if left is None or right is None: raise ParseException("invalid input or output") - try: - left_data = np.load(left) - right_data = np.load(right) - except UnicodeError as e: - self.log.error("%s %s" % ("UnicodeError", str(e))) - self.log.warning("Please check the npy file") - raise ParseException(ParseException.PARSE_UNICODE_ERROR) from e - except IOError: - self.log.error("Failed to load npy %s or %s." % (left, right)) - raise ParseException(ParseException.PARSE_LOAD_NPY_ERROR) from e + if self.util.check_path_valid(left) and self.util.check_path_valid(right): + try: + left_data = np.load(left) + right_data = np.load(right) + except UnicodeError as e: + self.log.error("%s %s" % ("UnicodeError", str(e))) + self.log.warning("Please check the npy file") + raise ParseException(ParseException.PARSE_UNICODE_ERROR) from e + except IOError as e: + self.log.error("Failed to load npy %s or %s." % (left, right)) + raise ParseException(ParseException.PARSE_LOAD_NPY_ERROR) from e # save to txt if save_txt: @@ -157,8 +158,10 @@ class Compare: return res def compare_npy(self, file, bench_file, output_path): - data = np.load(file) - bench_data = np.load(bench_file) + if self.util.check_path_valid(file): + data = np.load(file) + if self.util.check_path_valid(bench_file): + bench_data = np.load(bench_file) shape, dtype = data.shape, data.dtype bench_shape, bench_dtype = bench_data.shape, bench_data.dtype filename = os.path.basename(file) diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/config.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/config.py similarity index 98% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/config.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/config.py index a745ff46f08..a9a8b2b00e2 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/config.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/config.py @@ -33,7 +33,7 @@ class Const: OFFLINE_DUMP_CONVERT_PATTERN = \
r"^([A-Za-z0-9_-]+)\.([A-Za-z0-9_-]+)\.([0-9]+)(\.[0-9]+)?\.([0-9]{1,255})" \ r"\.([a-z]+)\.([0-9]{1,255})(\.[x0-9]+)?\.npy$" - NUMPY_PATTERN = r".*\.npy$" + NUMPY_PATTERN = r"^[\w.-]+\.npy$" NPY_SUFFIX = ".npy" PKL_SUFFIX = ".pkl" DIRECTORY_LENGTH = 4096 diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/file_desc.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/file_desc.py similarity index 100% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/file_desc.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/file_desc.py diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/interactive_cli.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/interactive_cli.py similarity index 93% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/interactive_cli.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/interactive_cli.py index 12b07183fbc..1ea7dd30153 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/interactive_cli.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/interactive_cli.py @@ -16,10 +16,10 @@ """ import cmd import argparse -from atat.pytorch.parse_tool.lib.parse_tool import ParseTool -from atat.pytorch.parse_tool.lib.utils import Util -from atat.pytorch.parse_tool.lib.config import Const -from atat.pytorch.parse_tool.lib.parse_exception import catch_exception +from msprobe.pytorch.parse_tool.lib.parse_tool import ParseTool +from msprobe.pytorch.parse_tool.lib.utils import Util +from msprobe.pytorch.parse_tool.lib.config import Const +from msprobe.pytorch.parse_tool.lib.parse_exception import catch_exception class InteractiveCli(cmd.Cmd): diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/parse_exception.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/parse_exception.py similarity index 96% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/parse_exception.py rename to
debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/parse_exception.py index 1177c51985d..7525230cedc 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/parse_exception.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/parse_exception.py @@ -15,7 +15,7 @@ # limitations under the License. """ import logging -from atat.core.common.exceptions import FileCheckException +from msprobe.core.common.exceptions import FileCheckException class ParseException(Exception): diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/parse_tool.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/parse_tool.py similarity index 95% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/parse_tool.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/parse_tool.py index 3e02baa1272..9a47dc54cf9 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/parse_tool.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/parse_tool.py @@ -18,11 +18,11 @@ import argparse import os from collections import namedtuple -from atat.pytorch.parse_tool.lib.config import Const -from atat.pytorch.parse_tool.lib.utils import Util -from atat.pytorch.parse_tool.lib.compare import Compare -from atat.pytorch.parse_tool.lib.visualization import Visualization -from atat.pytorch.parse_tool.lib.parse_exception import catch_exception, ParseException +from msprobe.pytorch.parse_tool.lib.config import Const +from msprobe.pytorch.parse_tool.lib.utils import Util +from msprobe.pytorch.parse_tool.lib.compare import Compare +from msprobe.pytorch.parse_tool.lib.visualization import Visualization +from msprobe.pytorch.parse_tool.lib.parse_exception import catch_exception, ParseException class ParseTool: diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/utils.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/utils.py similarity index 94% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/utils.py rename to 
debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/utils.py index ce42d242ba2..17a01f20fb0 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/utils.py @@ -25,15 +25,15 @@ import csv import time import numpy as np from collections import namedtuple -from atat.pytorch.parse_tool.lib.config import Const -from atat.pytorch.parse_tool.lib.file_desc import DumpDecodeFileDesc, FileDesc -from atat.pytorch.parse_tool.lib.parse_exception import ParseException -from atat.core.common.file_check import change_mode, check_other_user_writable,\ +from msprobe.pytorch.parse_tool.lib.config import Const +from msprobe.pytorch.parse_tool.lib.file_desc import DumpDecodeFileDesc, FileDesc +from msprobe.pytorch.parse_tool.lib.parse_exception import ParseException +from msprobe.core.common.file_check import change_mode, check_other_user_writable,\ check_path_executable, check_path_owner_consistent -from atat.core.common.const import FileCheckConst -from atat.core.common.file_check import FileOpen -from atat.core.common.utils import check_file_or_directory_path -from atat.pytorch.common.log import logger +from msprobe.core.common.const import FileCheckConst +from msprobe.core.common.file_check import FileOpen +from msprobe.core.common.utils import check_file_or_directory_path, check_path_before_create +from msprobe.pytorch.common.log import logger try: @@ -73,16 +73,6 @@ class Util: def path_strip(path): return path.strip("'").strip('"') - @staticmethod - def _gen_npu_dump_convert_file_info(name, match, dir_path): - return DumpDecodeFileDesc(name, dir_path, int(match.groups()[-4]), op_name=match.group(2), - op_type=match.group(1), task_id=int(match.group(3)), anchor_type=match.groups()[-3], - anchor_idx=int(match.groups()[-2])) - - @staticmethod - def _gen_numpy_file_info(name, math, dir_path): - return FileDesc(name, dir_path) - @staticmethod def check_executable_file(path): check_path_owner_consistent(path) @@ 
-184,6 +174,16 @@ Util: def change_filemode_safe(self, path): change_mode(path, FileCheckConst.DATA_FILE_AUTHORITY) + @staticmethod + def _gen_npu_dump_convert_file_info(name, match, dir_path): + return DumpDecodeFileDesc(name, dir_path, int(match.groups()[-4]), op_name=match.group(2), + op_type=match.group(1), task_id=int(match.group(3)), anchor_type=match.groups()[-3], + anchor_idx=int(match.groups()[-2])) + + @staticmethod + def _gen_numpy_file_info(name, math, dir_path): + return FileDesc(name, dir_path) + def execute_command(self, cmd): if not cmd: self.log.error("Commond is None") @@ -245,7 +245,11 @@ Util: elif data.size % align != 0: pad_array = np.zeros((align - data.size % align,)) data = np.append(data, pad_array) - np.savetxt(dst_file, data.reshape((-1, align)), delimiter=' ', fmt='%g') + check_path_before_create(dst_file) + try: + np.savetxt(dst_file, data.reshape((-1, align)), delimiter=' ', fmt='%g') + except Exception as e: + self.log.error("An unexpected error occurred: %s when savetxt to %s" % (str(e), dst_file)) change_mode(dst_file, FileCheckConst.DATA_FILE_AUTHORITY) def list_convert_files(self, path, external_pattern=""): diff --git a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/visualization.py b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/visualization.py similarity index 94% rename from debug/accuracy_tools/atat/pytorch/parse_tool/lib/visualization.py rename to debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/visualization.py index 3ef9878ae82..5e37b58d0b9 100644 --- a/debug/accuracy_tools/atat/pytorch/parse_tool/lib/visualization.py +++ b/debug/accuracy_tools/msprobe/pytorch/parse_tool/lib/visualization.py @@ -17,10 +17,10 @@ import json import numpy as np -from 
msprobe.pytorch.parse_tool.lib.config import Const +from msprobe.pytorch.parse_tool.lib.utils import Util +from msprobe.pytorch.parse_tool.lib.parse_exception import ParseException +from msprobe.core.common.file_check import FileOpen class Visualization: diff --git a/debug/accuracy_tools/atat/pytorch/pt_config.py b/debug/accuracy_tools/msprobe/pytorch/pt_config.py similarity index 58% rename from debug/accuracy_tools/atat/pytorch/pt_config.py rename to debug/accuracy_tools/msprobe/pytorch/pt_config.py index 0674b91b341..ceec92a633a 100644 --- a/debug/accuracy_tools/atat/pytorch/pt_config.py +++ b/debug/accuracy_tools/msprobe/pytorch/pt_config.py @@ -1,9 +1,10 @@ import json import os -from atat.core.common_config import CommonConfig, BaseConfig -from atat.core.common.file_check import FileOpen -from atat.core.common.const import Const +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.core.common.file_check import FileOpen +from msprobe.core.common.const import Const +from msprobe.pytorch.hook_module.utils import WrapFunctionalOps, WrapTensorOps, WrapTorchOps class TensorConfig(BaseConfig): @@ -31,12 +32,12 @@ class StatisticsConfig(BaseConfig): class OverflowCheckConfig(BaseConfig): def __init__(self, json_config): super().__init__(json_config) - self.overflow_num = json_config.get("overflow_nums") + self.overflow_nums = json_config.get("overflow_nums") self.check_mode = json_config.get("check_mode") self.check_overflow_config() def check_overflow_config(self): - if self.overflow_num is not None and not isinstance(self.overflow_num, int): + if self.overflow_nums is not None and not isinstance(self.overflow_nums, int): raise Exception("overflow_num is invalid") if self.check_mode is not None and self.check_mode not in ["all", "aicore", "atomic"]: raise Exception("check_mode is invalid") @@ -61,20 +62,54 @@ class FreeBenchmarkCheckConfig(BaseConfig): if self.preheat_step and self.preheat_step == 0: raise Exception("preheat_step cannot be 
0") + +class RunUTConfig(BaseConfig): + WrapApi = set(WrapFunctionalOps) | set(WrapTensorOps) | set(WrapTorchOps) + def __init__(self, json_config): + super().__init__(json_config) + self.white_list = json_config.get("white_list", Const.DEFAULT_LIST) + self.black_list = json_config.get("black_list", Const.DEFAULT_LIST) + self.error_data_path = json_config.get("error_data_path", Const.DEFAULT_PATH) + self.check_run_ut_config() + + @classmethod + def check_filter_list_config(cls, key, filter_list): + if not isinstance(filter_list, list): + raise Exception("%s must be a list type" % key) + if not all(isinstance(item, str) for item in filter_list): + raise Exception("All elements in %s must be string type" % key) + invalid_api = [item for item in filter_list if item not in cls.WrapApi] + if invalid_api: + raise Exception("Invalid api in %s: %s" % (key, invalid_api)) + + @classmethod + def check_error_data_path_config(cls, error_data_path): + if not os.path.exists(error_data_path): + raise Exception("error_data_path: %s does not exist" % error_data_path) + + def check_run_ut_config(self): + RunUTConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list) + RunUTConfig.check_filter_list_config(Const.BLACK_LIST, self.black_list) + RunUTConfig.check_error_data_path_config(self.error_data_path) + + def parse_task_config(task, json_config): default_dic = {} if task == Const.TENSOR: - config_dic = json_config.get(Const.TENSOR) if json_config.get(Const.TENSOR) else default_dic + config_dic = json_config.get(Const.TENSOR, default_dic) return TensorConfig(config_dic) elif task == Const.STATISTICS: - config_dic = json_config.get(Const.STATISTICS) if json_config.get(Const.STATISTICS) else default_dic + config_dic = json_config.get(Const.STATISTICS, default_dic) return StatisticsConfig(config_dic) elif task == Const.OVERFLOW_CHECK: - config_dic = json_config.get(Const.OVERFLOW_CHECK) if json_config.get(Const.OVERFLOW_CHECK) else default_dic + config_dic = 
json_config.get(Const.OVERFLOW_CHECK, default_dic) return OverflowCheckConfig(config_dic) elif task == Const.FREE_BENCHMARK: - config_dic = json_config.get(Const.FREE_BENCHMARK) if json_config.get(Const.FREE_BENCHMARK) else default_dic + config_dic = json_config.get(Const.FREE_BENCHMARK, default_dic) return FreeBenchmarkCheckConfig(config_dic) + elif task == Const.RUN_UT: + config_dic = json_config.get(Const.RUN_UT, default_dic) + return RunUTConfig(config_dic) else: return StatisticsConfig(default_dic) diff --git a/debug/accuracy_tools/atat/pytorch/service.py b/debug/accuracy_tools/msprobe/pytorch/service.py similarity index 82% rename from debug/accuracy_tools/atat/pytorch/service.py rename to debug/accuracy_tools/msprobe/pytorch/service.py index d0b9c4d4b27..6b8d67abc9f 100644 --- a/debug/accuracy_tools/atat/pytorch/service.py +++ b/debug/accuracy_tools/msprobe/pytorch/service.py @@ -2,17 +2,18 @@ import functools import os from pathlib import Path -from atat.pytorch.common.log import logger -from atat.core.common.file_check import FileChecker, check_path_before_create -from atat.core.common.const import Const, FileCheckConst -from atat.core.common.exceptions import DistributedNotInitializedError, MsaccException -from atat.core.data_dump.data_collector import build_data_collector -from atat.core.data_dump.scope import BaseScope -from atat.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs -from atat.pytorch.common.utils import get_rank_if_initialized -from atat.pytorch.module_processer import ModuleProcesser -from atat.pytorch.hook_module import remove_dropout -from atat.pytorch.hook_module.api_registry import api_register +from msprobe.core.common.const import Const, FileCheckConst +from msprobe.core.common.exceptions import DistributedNotInitializedError, MsprobeException +from msprobe.core.common.file_check import FileChecker, check_path_before_create +from msprobe.core.data_dump.data_collector import 
build_data_collector +from msprobe.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs +from msprobe.core.data_dump.scope import BaseScope +from msprobe.pytorch.common.log import logger +from msprobe.pytorch.common.utils import get_rank_if_initialized +from msprobe.pytorch.hook_module import remove_dropout +from msprobe.pytorch.hook_module.api_registry import api_register +from msprobe.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.module_processer import ModuleProcesser class Service: @@ -27,6 +28,11 @@ class Service: self.current_rank = None self.dump_iter_dir = None + @staticmethod + def forward_backward_dump_end(): + logger.info_on_rank_0("Data needed ends here.") + api_register.api_originality() + def build_hook(self, module_type, name): def pre_hook(api_or_module_name, module, args, kwargs): if module_type == BaseScope.Module_Type_Module: @@ -62,7 +68,8 @@ class Service: if not self.switch: return if self.data_collector: - module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_input, grad_output=grad_output) + # 此处获取到的grad_input实际为反向过程的输出数据,grad_output为反向过程的输入数据,因此传入时调换顺序 + module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input) self.data_collector.backward_data_collect(api_or_module_name, module, pid, module_input_output) pid = os.getpid() @@ -77,11 +84,14 @@ class Service: self.current_iter += 1 self.data_collector.update_iter(self.current_iter) - def start(self, model): + ModuleProcesser.reset_module_stats() + HOOKModule.reset_module_stats() + + def start(self, model, api_origin=False): self.model = model if self.config.step and self.current_iter > max(self.config.step): self.stop() - raise Exception("atat: exit after iteration {}".format(max(self.config.step))) + raise Exception("msprobe: exit after iteration {}".format(max(self.config.step))) if self.config.step and self.current_iter not in self.config.step: return if 
self.first_start: @@ -94,6 +104,8 @@ class Service: return self.register_hook_new() self.first_start = False + if api_origin: + api_register.api_modularity() self.switch = True logger.info_on_rank_0(f"Dump switch is turned on at step {self.current_iter}. ") if self.config.level != "L2": @@ -138,7 +150,7 @@ class Service: logger.info_on_rank_0("The {} hook function is successfully mounted to the model.".format(self.config.task)) if self.config.level in ["L0", "mix"]: if self.model is None: - logger.error_log_with_exp("The model is None.", MsaccException.INVALID_PARAM_ERROR) + logger.error_log_with_exp("The model is None.", MsprobeException.INVALID_PARAM_ERROR) logger.info_on_rank_0("The init dump mode is enabled, and the module dump function will not be available") for name, module in self.model.named_modules(): if module == self.model: @@ -164,4 +176,4 @@ class Service: api_register.api_modularity() if Const.STATISTICS == self.config.task or Const.TENSOR == self.config.task: - remove_dropout() + remove_dropout() \ No newline at end of file diff --git a/debug/accuracy_tools/atat/test/core_ut/test_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py similarity index 90% rename from debug/accuracy_tools/atat/test/core_ut/test_utils.py rename to debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py index b3273358e43..edd3eb53dcc 100644 --- a/debug/accuracy_tools/atat/test/core_ut/test_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py @@ -20,9 +20,9 @@ import uuid from unittest import TestCase from unittest.mock import patch, MagicMock, mock_open -from atat.core.common.log import logger -from atat.core.common.const import Const -from atat.core.common.utils import (CompareException, +from msprobe.core.common.log import logger +from msprobe.core.common.const import Const +from msprobe.core.common.utils import (CompareException, check_seed_all, check_inplace_op, make_dump_path_if_not_exists, @@ -41,7 +41,7 @@ from 
atat.core.common.utils import (CompareException, check_regex_prefix_format_valid, get_dump_data_path, task_dumppath_get) -from atat.core.common.file_check import FileCheckConst +from msprobe.core.common.file_check import FileCheckConst class TestUtils(TestCase): @@ -88,7 +88,7 @@ class TestUtils(TestCase): raise OSError if not os.path.exists(dirname): - with patch("atat.core.common.utils.Path.mkdir", new=test_mkdir): + with patch("msprobe.core.common.utils.Path.mkdir", new=test_mkdir): with self.assertRaises(CompareException) as context: make_dump_path_if_not_exists(dirname) self.assertEqual(context.exception.code, CompareException.INVALID_PATH_ERROR) @@ -171,7 +171,7 @@ class TestUtils(TestCase): file_path = os.path.realpath(__file__) dirname = os.path.dirname(file_path) - with patch("atat.core.common.utils.FileChecker", new=TestFileChecker): + with patch("msprobe.core.common.utils.FileChecker", new=TestFileChecker): check_file_or_directory_path(file_path, isdir=False) self.assertTrue(TestFileChecker.checked) self.assertEqual(TestFileChecker.file_path, file_path) @@ -179,7 +179,7 @@ class TestUtils(TestCase): self.assertEqual(TestFileChecker.ability, FileCheckConst.READ_ABLE) TestFileChecker.checked = False - with patch("atat.core.common.utils.FileChecker", new=TestFileChecker): + with patch("msprobe.core.common.utils.FileChecker", new=TestFileChecker): check_file_or_directory_path(dirname, isdir=True) self.assertTrue(TestFileChecker.checked) self.assertEqual(TestFileChecker.file_path, dirname) @@ -216,9 +216,9 @@ class TestUtils(TestCase): mock_check_file_or_directory_path = MagicMock() mock_check_json_file = MagicMock() - with patch("atat.core.common.utils.FileOpen", mock_open(read_data="")), \ - patch("atat.core.common.utils.check_json_file", new=mock_check_json_file), \ - patch("atat.core.common.utils.check_file_or_directory_path", new=mock_check_file_or_directory_path): + with patch("msprobe.core.common.utils.FileOpen", mock_open(read_data="")), \ + 
patch("msprobe.core.common.utils.check_json_file", new=mock_check_json_file), \ + patch("msprobe.core.common.utils.check_file_or_directory_path", new=mock_check_file_or_directory_path): check_compare_param(params, "output_path") check_compare_param(params, "output_path", summary_compare=False, md5_compare=True) for i in range(len(call_args)): @@ -261,7 +261,7 @@ class TestUtils(TestCase): _check_json(handler, "test.json") self.assertEqual(handler.string, "0_0") - @patch("atat.core.common.utils._check_json") + @patch("msprobe.core.common.utils._check_json") def test_check_json_file(self, _mock_check_json): input_param = { "npu_json_path": "npu_json_path", @@ -275,7 +275,7 @@ class TestUtils(TestCase): @patch.object(logger, "error") def test_check_file_size(self, mock_error): - with patch("atat.core.common.utils.os.path.getsize", return_value=120): + with patch("msprobe.core.common.utils.os.path.getsize", return_value=120): with self.assertRaises(CompareException) as context: check_file_size("input_file", 100) self.assertEqual(context.exception.code, CompareException.INVALID_FILE_ERROR) @@ -294,7 +294,7 @@ class TestUtils(TestCase): self.assertEqual(str(context.exception), f"prefix contains invalid characters, " f"prefix pattern {Const.REGEX_PREFIX_PATTERN}") - @patch("atat.core.common.utils.check_file_or_directory_path") + @patch("msprobe.core.common.utils.check_file_or_directory_path") def test_get_dump_data_path(self, mock_check_file_or_directory_path): file_path = os.path.realpath(__file__) dirname = os.path.dirname(file_path) @@ -322,23 +322,23 @@ class TestUtils(TestCase): mock_error.assert_called_with("Please check the json path is valid.") input_param["npu_json_path"] = "npu_json_path" - with patch("atat.core.common.utils.FileOpen", mock_open(read_data="")), \ - patch("atat.core.common.utils.json.load", return_value=npu_json): + with patch("msprobe.core.common.utils.FileOpen", mock_open(read_data="")), \ + patch("msprobe.core.common.utils.json.load", 
return_value=npu_json): summary_compare, md5_compare = task_dumppath_get(input_param) self.assertFalse(summary_compare) self.assertFalse(md5_compare) npu_json["task"] = Const.STATISTICS - with patch("atat.core.common.utils.FileOpen", mock_open(read_data="")), \ - patch("atat.core.common.utils.json.load", return_value=npu_json), \ - patch("atat.core.common.utils.md5_find", return_value=True): + with patch("msprobe.core.common.utils.FileOpen", mock_open(read_data="")), \ + patch("msprobe.core.common.utils.json.load", return_value=npu_json), \ + patch("msprobe.core.common.utils.md5_find", return_value=True): summary_compare, md5_compare = task_dumppath_get(input_param) self.assertFalse(summary_compare) self.assertTrue(md5_compare) npu_json["task"] = Const.OVERFLOW_CHECK - with patch("atat.core.common.utils.FileOpen", mock_open(read_data="")), \ - patch("atat.core.common.utils.json.load", return_value=npu_json): + with patch("msprobe.core.common.utils.FileOpen", mock_open(read_data="")), \ + patch("msprobe.core.common.utils.json.load", return_value=npu_json): with self.assertRaises(CompareException) as context: task_dumppath_get(input_param) self.assertEqual(context.exception.code, CompareException.INVALID_TASK_ERROR) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_data_collector.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_data_collector.py new file mode 100644 index 00000000000..eedbe5be7e0 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_data_collector.py @@ -0,0 +1,47 @@ +import unittest +from unittest.mock import patch, mock_open, MagicMock + +from msprobe.core.common.utils import Const +from msprobe.core.data_dump.data_collector import DataCollector +from msprobe.pytorch.debugger.debugger_config import DebuggerConfig +from msprobe.pytorch.pt_config import parse_json_config + + +class TestDataCollector(unittest.TestCase): + def setUp(self): + mock_json_data = { + "dump_path": "./ut_dump", + } + 
with patch("msprobe.pytorch.pt_config.FileOpen", mock_open(read_data='')), \ + patch("msprobe.pytorch.pt_config.json.load", return_value=mock_json_data): + common_config, task_config = parse_json_config("./config.json", Const.STATISTICS) + config = DebuggerConfig(common_config, task_config, Const.STATISTICS, "./ut_dump", "L1") + self.data_collector = DataCollector(config) + + def test_update_data(self): + self.data_collector.config.task = Const.OVERFLOW_CHECK + self.data_collector.data_processor.has_overflow = True + with patch("msprobe.core.data_dump.json_writer.DataWriter.update_data", return_value=None): + result1 = self.data_collector.update_data("test message", "test1:") + self.assertEqual(result1, "test1:Overflow detected.") + + self.data_collector.data_processor.has_overflow = False + result2 = self.data_collector.update_data("test message", "test2:") + self.assertEqual(result2, "test2:No Overflow, OK.") + + self.data_collector.config.task = Const.STATISTICS + self.data_collector.data_processor.has_overflow = True + with patch("msprobe.core.data_dump.json_writer.DataWriter.update_data", return_value=None): + result3 = self.data_collector.update_data("test message", "test3") + self.assertEqual(result3, "test3") + + def test_pre_forward_data_collect(self): + self.data_collector.check_scope_and_pid = MagicMock(return_value=False) + self.data_collector.is_inplace = MagicMock(return_value=False) + self.data_collector.data_processor.analyze_pre_forward = MagicMock() + name = "TestModule.forward" + pid = 123 + + self.data_collector.pre_forward_data_collect(name, None, pid, None) + self.data_collector.check_scope_and_pid.assert_called_once_with( + self.data_collector.scope, "TestModule.backward", 123) diff --git a/debug/accuracy_tools/atat/test/core_ut/data_dump/test_json_writer.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_json_writer.py similarity index 97% rename from debug/accuracy_tools/atat/test/core_ut/data_dump/test_json_writer.py rename to 
debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_json_writer.py index 867da001e61..cfb1b3d551a 100644 --- a/debug/accuracy_tools/atat/test/core_ut/data_dump/test_json_writer.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_json_writer.py @@ -1,10 +1,10 @@ import unittest -from atat.core.data_dump.json_writer import DataWriter +from msprobe.core.data_dump.json_writer import DataWriter import os import csv -from atat.core.common.file_check import FileOpen -from atat.core.common import utils +from msprobe.core.common.file_check import FileOpen +from msprobe.core.common import utils from pathlib import Path import json diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_scope.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_scope.py new file mode 100644 index 00000000000..1989fd0a95a --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/test_scope.py @@ -0,0 +1,151 @@ +import unittest +from unittest.mock import MagicMock + +from msprobe.core.common.exceptions import ScopeException +from msprobe.core.data_dump.scope import ( + build_scope, + build_range_scope_according_to_scope_name, + BaseScope, + ListScope, + RangeScope, + APIRangeScope, + ModuleRangeScope +) + + +class TestBuildScope(unittest.TestCase): + def test_build_scope(self): + scope_class = MagicMock() + result1 = build_scope(scope_class, None, None) + self.assertEqual(result1, None) + + api_list = ['api1', 'api2'] + result2 = build_scope(scope_class, None, api_list) + self.assertEqual(result2, scope_class.return_value) + + def test_build_range_scope_according_to_scope_name(self): + result = build_range_scope_according_to_scope_name([], []) + self.assertIsInstance(result, APIRangeScope) + + +class TestBaseScope(unittest.TestCase): + def test_rectify_args(self): + scope = [] + api_list = "invalid_api_list" + with self.assertRaises(ScopeException) as context: + BaseScope.rectify_args(scope, api_list) + 
self.assertEqual(context.exception.code, ScopeException.InvalidApiStr) + + api_list = [1, 2, 3] + with self.assertRaises(ScopeException) as context: + BaseScope.rectify_args(scope, api_list) + self.assertEqual(context.exception.code, ScopeException.InvalidApiStr) + + scope = "module1" + api_list = [] + + expected_scope = ["module1"] + expected_api_list = [] + result_scope, result_api_list = BaseScope.rectify_args(scope, api_list) + self.assertEqual(result_scope, expected_scope) + self.assertEqual(result_api_list, expected_api_list) + + scope = 123 + api_list = [] + with self.assertRaises(ScopeException) as context: + BaseScope.rectify_args(scope, api_list) + self.assertEqual(context.exception.code, ScopeException.InvalidScope) + + scope = ["module1", 2, "module3"] + api_list = [] + with self.assertRaises(ScopeException) as context: + BaseScope.rectify_args(scope, api_list) + self.assertEqual(context.exception.code, ScopeException.InvalidScope) + + +class TestListScope(unittest.TestCase): + def test_rectify_args(self): + scope = ["module1"] + api_list = ["api1"] + with self.assertRaises(ScopeException) as context: + ListScope.rectify_args(scope, api_list) + self.assertEqual(context.exception.code, ScopeException.ArgConflict) + + def test_check(self): + list_scope = ListScope([], []) + module_name = "module1" + result = list_scope.check(module_name) + self.assertTrue(result) + + list_scope = ListScope(["module1"], []) + module_name = "module1" + result = list_scope.check(module_name) + self.assertTrue(result) + + list_scope = ListScope(["module1"], []) + module_name = "module2" + result = list_scope.check(module_name) + self.assertFalse(result) + + +class TestRangeScope(unittest.TestCase): + def test_rectify_args(self): + scope = ["module1", "module2", "module3"] + with self.assertRaises(ScopeException) as context: + RangeScope.rectify_args(scope, []) + self.assertEqual(context.exception.code, ScopeException.InvalidScope) + + scope = ["module1"] + expected_scope = 
["module1", "module1"] + result_scope, result_api_list = RangeScope.rectify_args(scope, []) + self.assertEqual(result_scope, expected_scope) + + +class TestAPIRangeScope(unittest.TestCase): + def test_check_scope_is_valid(self): + api_range_scope = APIRangeScope([], []) + result = api_range_scope.check_scope_is_valid() + self.assertTrue(result) + + def test_check(self): + api_range_scope = APIRangeScope([], []) + api_name = "api1" + result = api_range_scope.check(api_name) + self.assertTrue(result) + + +class TestModuleRangeScope(unittest.TestCase): + def test_check_scope_is_valid(self): + module_range_scope = ModuleRangeScope([], []) + result = module_range_scope.check_scope_is_valid() + self.assertTrue(result) + + def test_begin_module(self): + module_range_scope = ModuleRangeScope(["module1", "module2"], []) + module_name = "module1" + module_range_scope.begin_module(module_name) + self.assertTrue(module_range_scope.in_scope) + + module_range_scope = ModuleRangeScope(["module1", "module2"], []) + module_name = "module3" + module_range_scope.begin_module(module_name) + self.assertFalse(module_range_scope.in_scope) + + def test_end_module(self): + module_range_scope = ModuleRangeScope(["module1", "module2"], []) + module_name = "module2" + module_range_scope.in_scope = True + module_range_scope.end_module(module_name) + self.assertFalse(module_range_scope.in_scope) + + module_range_scope = ModuleRangeScope(["module1", "module2"], []) + module_name = "module3" + module_range_scope.in_scope = True + module_range_scope.end_module(module_name) + self.assertTrue(module_range_scope.in_scope) + + def test_check(self): + module_range_scope = ModuleRangeScope([], []) + module_name = "module1" + result = module_range_scope.check(module_name) + self.assertTrue(result) diff --git a/debug/accuracy_tools/atat/test/core_ut/test_common_config.py b/debug/accuracy_tools/msprobe/test/core_ut/test_common_config.py similarity index 83% rename from 
debug/accuracy_tools/atat/test/core_ut/test_common_config.py rename to debug/accuracy_tools/msprobe/test/core_ut/test_common_config.py index 00b17e1f1cf..8b2138a485b 100644 --- a/debug/accuracy_tools/atat/test/core_ut/test_common_config.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/test_common_config.py @@ -17,10 +17,10 @@ from unittest import TestCase from unittest.mock import patch -from atat.core.common.log import logger -from atat.core.common.const import Const -from atat.core.common.exceptions import MsaccException -from atat.core.common_config import CommonConfig, BaseConfig +from msprobe.core.common.log import logger +from msprobe.core.common.const import Const +from msprobe.core.common.exceptions import MsprobeException +from msprobe.core.common_config import CommonConfig, BaseConfig class TestCommonConfig(TestCase): @@ -44,7 +44,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "task is invalid, it should be one of {}".format(Const.TASK_LIST)) self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"task": Const.TENSOR}) json_config.update({"rank": 0}) @@ -52,7 +52,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "rank is invalid, it should be a list") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"task": Const.TENSOR}) json_config.update({"rank": [0]}) @@ -61,7 +61,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "step is invalid, it should be a list") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - 
MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"task": Const.TENSOR}) json_config.update({"rank": [0]}) @@ -71,7 +71,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "level is invalid, it should be one of {}".format(Const.LEVEL_LIST)) self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"task": Const.TENSOR}) json_config.update({"rank": [0]}) @@ -82,7 +82,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "seed is invalid, it should be an integer") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"task": Const.TENSOR}) json_config.update({"rank": [0]}) @@ -94,7 +94,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "is_deterministic is invalid, it should be a boolean") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"task": Const.TENSOR}) json_config.update({"rank": [0]}) @@ -107,7 +107,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "enable_dataloader is invalid, it should be a boolean") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) @patch.object(logger, "error_log_with_exp") def test_base_config(self, 
mock_error_log_with_exp): @@ -121,7 +121,7 @@ class TestCommonConfig(TestCase): self.assertIsNone(base_config.backward_input) self.assertIsNone(base_config.file_format) self.assertIsNone(base_config.summary_mode) - self.assertIsNone(base_config.overflow_num) + self.assertIsNone(base_config.overflow_nums) self.assertIsNone(base_config.check_mode) json_config.update({"scope": "Tensor_Add"}) @@ -130,7 +130,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "scope is invalid, it should be a list") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"scope": ["Tensor_Add"]}) json_config.update({"list": "Tensor_Add"}) @@ -139,7 +139,7 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "list is invalid, it should be a list") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) json_config.update({"scope": ["Tensor_Add"]}) json_config.update({"list": ["Tensor_Add"]}) @@ -149,4 +149,4 @@ class TestCommonConfig(TestCase): self.assertEqual(mock_error_log_with_exp.call_args[0][0], "data_mode is invalid, it should be a list") self.assertEqual(str(mock_error_log_with_exp.call_args[0][1]), - MsaccException.err_strs.get(MsaccException.INVALID_PARAM_ERROR)) + MsprobeException.err_strs.get(MsprobeException.INVALID_PARAM_ERROR)) diff --git a/debug/accuracy_tools/atat/test/core_ut/test_file_check.py b/debug/accuracy_tools/msprobe/test/core_ut/test_file_check.py similarity index 85% rename from debug/accuracy_tools/atat/test/core_ut/test_file_check.py rename to debug/accuracy_tools/msprobe/test/core_ut/test_file_check.py index aa7882aa590..ecdf3da9fed 100644 --- 
a/debug/accuracy_tools/atat/test/core_ut/test_file_check.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/test_file_check.py @@ -19,10 +19,10 @@ import os from unittest import TestCase from unittest.mock import patch, MagicMock -from atat.core.common.log import logger -from atat.core.common.const import FileCheckConst -from atat.core.common.exceptions import FileCheckException -from atat.core.common.file_check import (check_link, +from msprobe.core.common.log import logger +from msprobe.core.common.const import FileCheckConst +from msprobe.core.common.exceptions import FileCheckException +from msprobe.core.common.file_check import (check_link, check_path_length, check_path_exists, check_path_readability, @@ -40,7 +40,7 @@ from atat.core.common.file_check import (check_link, class TestFileCheckUtil(TestCase): @patch.object(logger, "error") def test_check_link(self, mock_logger_error): - with patch("atat.core.common.file_check.os.path.islink", return_value=True): + with patch("msprobe.core.common.file_check.os.path.islink", return_value=True): with self.assertRaises(FileCheckException) as context: check_link("link_path") self.assertEqual(str(context.exception), @@ -72,7 +72,7 @@ class TestFileCheckUtil(TestCase): @patch.object(logger, "error") def test_check_path_exists(self, mock_logger_error): - with patch("atat.core.common.file_check.os.path.exists", return_value=False): + with patch("msprobe.core.common.file_check.os.path.exists", return_value=False): with self.assertRaises(FileCheckException) as context: check_path_exists("file_path") self.assertEqual(str(context.exception), @@ -82,7 +82,7 @@ class TestFileCheckUtil(TestCase): @patch.object(logger, "error") def test_check_path_readability(self, mock_logger_error): path = "file_path" - with patch("atat.core.common.file_check.os.access", return_value=False): + with patch("msprobe.core.common.file_check.os.access", return_value=False): with self.assertRaises(FileCheckException) as context: 
check_path_readability(path) self.assertEqual(str(context.exception), @@ -91,14 +91,14 @@ class TestFileCheckUtil(TestCase): mock_access = MagicMock() mock_access.return_value = True - with patch("atat.core.common.file_check.os.access", new=mock_access): + with patch("msprobe.core.common.file_check.os.access", new=mock_access): check_path_readability(path) self.assertEqual(mock_access.call_args[0], (path, os.R_OK)) @patch.object(logger, "error") def test_check_path_writability(self, mock_logger_error): path = "file_path" - with patch("atat.core.common.file_check.os.access", return_value=False): + with patch("msprobe.core.common.file_check.os.access", return_value=False): with self.assertRaises(FileCheckException) as context: check_path_writability(path) self.assertEqual(str(context.exception), @@ -107,14 +107,14 @@ class TestFileCheckUtil(TestCase): mock_access = MagicMock() mock_access.return_value = True - with patch("atat.core.common.file_check.os.access", new=mock_access): + with patch("msprobe.core.common.file_check.os.access", new=mock_access): check_path_writability(path) self.assertEqual(mock_access.call_args[0], (path, os.W_OK)) @patch.object(logger, "error") def test_check_path_executable(self, mock_logger_error): path = "file_path" - with patch("atat.core.common.file_check.os.access", return_value=False): + with patch("msprobe.core.common.file_check.os.access", return_value=False): with self.assertRaises(FileCheckException) as context: check_path_executable(path) self.assertEqual(str(context.exception), @@ -123,7 +123,7 @@ class TestFileCheckUtil(TestCase): mock_access = MagicMock() mock_access.return_value = True - with patch("atat.core.common.file_check.os.access", new=mock_access): + with patch("msprobe.core.common.file_check.os.access", new=mock_access): check_path_executable(path) self.assertEqual(mock_access.call_args[0], (path, os.X_OK)) @@ -135,7 +135,7 @@ class TestFileCheckUtil(TestCase): path = "file_path" mock_stat = TestStat(0o002) - with 
patch("atat.core.common.file_check.os.stat", return_value=mock_stat): + with patch("msprobe.core.common.file_check.os.stat", return_value=mock_stat): with self.assertRaises(FileCheckException) as context: check_other_user_writable(path) self.assertEqual(str(context.exception), @@ -147,7 +147,7 @@ class TestFileCheckUtil(TestCase): def test_check_path_owner_consistent(self, mock_logger_error): file_path = os.path.realpath(__file__) file_owner = os.stat(file_path).st_uid - with patch("atat.core.common.file_check.os.getuid", return_value=file_owner+1): + with patch("msprobe.core.common.file_check.os.getuid", return_value=file_owner+1): with self.assertRaises(FileCheckException) as context: check_path_owner_consistent(file_path) self.assertEqual(str(context.exception), @@ -160,7 +160,7 @@ class TestFileCheckUtil(TestCase): path = "path" mock_re_match = MagicMock() mock_re_match.return_value = False - with patch("atat.core.common.file_check.re.match", new=mock_re_match): + with patch("msprobe.core.common.file_check.re.match", new=mock_re_match): with self.assertRaises(FileCheckException) as context: check_path_pattern_vaild(path) self.assertEqual(str(context.exception), @@ -181,8 +181,8 @@ class TestFileCheckUtil(TestCase): def test_check_common_file_size(self): mock_check_file_size = MagicMock() - with patch("atat.core.common.file_check.os.path.isfile", return_value=True), \ - patch("atat.core.common.file_check.check_file_size", new=mock_check_file_size): + with patch("msprobe.core.common.file_check.os.path.isfile", return_value=True), \ + patch("msprobe.core.common.file_check.check_file_size", new=mock_check_file_size): for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): check_common_file_size(suffix) mock_check_file_size.assert_called_with(suffix, max_size) @@ -201,16 +201,16 @@ class TestFileCheckUtil(TestCase): def test_check_path_type(self, mock_logger_error): file_path = "file_path" - with patch("atat.core.common.file_check.os.path.isfile", 
return_value=False), \ - patch("atat.core.common.file_check.os.path.isdir", return_value=True): + with patch("msprobe.core.common.file_check.os.path.isfile", return_value=False), \ + patch("msprobe.core.common.file_check.os.path.isdir", return_value=True): with self.assertRaises(FileCheckException) as context: check_path_type(file_path, FileCheckConst.FILE) self.assertEqual(str(context.exception), FileCheckException.err_strs.get(FileCheckException.INVALID_FILE_ERROR)) mock_logger_error.assert_called_with(f"The {file_path} should be a file!") - with patch("atat.core.common.file_check.os.path.isfile", return_value=True), \ - patch("atat.core.common.file_check.os.path.isdir", return_value=False): + with patch("msprobe.core.common.file_check.os.path.isfile", return_value=True), \ + patch("msprobe.core.common.file_check.os.path.isdir", return_value=False): with self.assertRaises(FileCheckException) as context: check_path_type(file_path, FileCheckConst.DIR) self.assertEqual(str(context.exception), diff --git a/debug/accuracy_tools/atat/test/core_ut/test_log.py b/debug/accuracy_tools/msprobe/test/core_ut/test_log.py similarity index 92% rename from debug/accuracy_tools/atat/test/core_ut/test_log.py rename to debug/accuracy_tools/msprobe/test/core_ut/test_log.py index 6d7998d5ae0..1687c48d025 100644 --- a/debug/accuracy_tools/atat/test/core_ut/test_log.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/test_log.py @@ -17,11 +17,11 @@ from unittest import TestCase from unittest.mock import patch, MagicMock -from atat.core.common.log import BaseLogger, logger +from msprobe.core.common.log import BaseLogger, logger class TestLog(TestCase): - @patch("atat.core.common.log.print") + @patch("msprobe.core.common.log.print") def test__print_log(self, mock_print): logger._print_log("level", "msg") self.assertIn("[level] msg", mock_print.call_args[0][0]) @@ -75,7 +75,7 @@ class TestLog(TestCase): @patch.object(BaseLogger, "get_rank") def test_info_on_rank_0(self, mock_get_rank): 
mock_print = MagicMock() - with patch("atat.core.common.log.print", new=mock_print): + with patch("msprobe.core.common.log.print", new=mock_print): mock_get_rank.return_value = 0 logger.info_on_rank_0("msg") self.assertIn("[INFO] msg", mock_print.call_args[0][0]) @@ -87,7 +87,7 @@ class TestLog(TestCase): @patch.object(BaseLogger, "get_rank") def test_error_on_rank_0(self, mock_get_rank): mock_print = MagicMock() - with patch("atat.core.common.log.print", new=mock_print): + with patch("msprobe.core.common.log.print", new=mock_print): mock_get_rank.return_value = 0 logger.error_on_rank_0("msg") self.assertIn("[ERROR] msg", mock_print.call_args[0][0]) @@ -99,7 +99,7 @@ class TestLog(TestCase): @patch.object(BaseLogger, "get_rank") def test_warning_on_rank_0(self, mock_get_rank): mock_print = MagicMock() - with patch("atat.core.common.log.print", new=mock_print): + with patch("msprobe.core.common.log.print", new=mock_print): mock_get_rank.return_value = 0 logger.warning_on_rank_0("msg") self.assertIn("[WARNING] msg", mock_print.call_args[0][0]) diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_api_kbk_dump.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_api_kbk_dump.py similarity index 75% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_api_kbk_dump.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_api_kbk_dump.py index 47d60999b16..7411018ff08 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_api_kbk_dump.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_api_kbk_dump.py @@ -19,9 +19,9 @@ import os from unittest import TestCase from unittest.mock import patch -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.dump.api_kbk_dump import ApiKbkDump +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from 
msprobe.mindspore.dump.api_kbk_dump import ApiKbkDump class TestApiKbkDump(TestCase): @@ -42,10 +42,10 @@ class TestApiKbkDump(TestCase): self.assertEqual(dumper.dump_json["common_dump_settings"]["iteration"], "0|2") os.environ["MS_ACL_DUMP_CFG_PATH"] = "path" - with patch("atat.mindspore.dump.api_kbk_dump.make_dump_path_if_not_exists"), \ - patch("atat.mindspore.dump.api_kbk_dump.FileOpen"), \ - patch("atat.mindspore.dump.api_kbk_dump.json.dump"), \ - patch("atat.mindspore.dump.api_kbk_dump.logger.info"): + with patch("msprobe.mindspore.dump.api_kbk_dump.make_dump_path_if_not_exists"), \ + patch("msprobe.mindspore.dump.api_kbk_dump.FileOpen"), \ + patch("msprobe.mindspore.dump.api_kbk_dump.json.dump"), \ + patch("msprobe.mindspore.dump.api_kbk_dump.logger.info"): dumper.handle() self.assertEqual(os.environ.get("GRAPH_OP_RUN"), "1") self.assertEqual(os.environ.get("MS_ACL_DUMP_CFG_PATH"), None) diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_debugger_config.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_debugger_config.py similarity index 87% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_debugger_config.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_debugger_config.py index 3bdf341c397..5187d3951c0 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_debugger_config.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_debugger_config.py @@ -16,9 +16,9 @@ """ from unittest import TestCase -from atat.core.common.const import Const -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.core.common.const import Const +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig class TestDebuggerConfig(TestCase): @@ -27,7 +27,7 @@ class TestDebuggerConfig(TestCase): "dump_path": "/absolute_path", "rank": [], "step": [], - "level": "L1" 
+ "level": "L0" } common_config = CommonConfig(json_config) task_config = BaseConfig(json_config) diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_dump_tool_factory.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_dump_tool_factory.py similarity index 89% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_dump_tool_factory.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_dump_tool_factory.py index f6626f551fe..fb88d7bbbf3 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_dump_tool_factory.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_dump_tool_factory.py @@ -16,9 +16,9 @@ """ from unittest import TestCase -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.dump.dump_tool_factory import DumpToolFactory +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory class TestDumpToolFactory(TestCase): diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_kernel_graph_dump.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_kernel_graph_dump.py similarity index 80% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_kernel_graph_dump.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_kernel_graph_dump.py index 6c59521a17d..e691a2c7edd 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_kernel_graph_dump.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_kernel_graph_dump.py @@ -19,9 +19,9 @@ import os from unittest import TestCase from unittest.mock import patch -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.dump.kernel_graph_dump import KernelGraphDump +from msprobe.core.common_config import 
CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump class TestKernelGraphDump(TestCase): @@ -45,10 +45,10 @@ class TestKernelGraphDump(TestCase): self.assertEqual(dumper.dump_json["common_dump_settings"]["file_format"], "bin") self.assertEqual(dumper.dump_json["common_dump_settings"]["input_output"], 2) - with patch("atat.mindspore.dump.kernel_graph_dump.make_dump_path_if_not_exists"), \ - patch("atat.mindspore.dump.kernel_graph_dump.FileOpen"), \ - patch("atat.mindspore.dump.kernel_graph_dump.json.dump"), \ - patch("atat.mindspore.dump.kernel_graph_dump.logger.info"): + with patch("msprobe.mindspore.dump.kernel_graph_dump.make_dump_path_if_not_exists"), \ + patch("msprobe.mindspore.dump.kernel_graph_dump.FileOpen"), \ + patch("msprobe.mindspore.dump.kernel_graph_dump.json.dump"), \ + patch("msprobe.mindspore.dump.kernel_graph_dump.logger.info"): os.environ["GRAPH_OP_RUN"] = "1" with self.assertRaises(Exception) as context: diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_kernel_graph_overflow_check.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py similarity index 76% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_kernel_graph_overflow_check.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py index 101482458dc..a93fab021ab 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_kernel_graph_overflow_check.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py @@ -19,9 +19,9 @@ import os from unittest import TestCase from unittest.mock import patch -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck +from msprobe.core.common_config 
import CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck class TestKernelGraphOverflowCheck(TestCase): @@ -43,10 +43,10 @@ class TestKernelGraphOverflowCheck(TestCase): self.assertEqual(checker.dump_json["common_dump_settings"]["op_debug_mode"], 2) os.environ["MS_ACL_DUMP_CFG_PATH"] = "path" - with patch("atat.mindspore.overflow_check.kernel_graph_overflow_check.make_dump_path_if_not_exists"), \ - patch("atat.mindspore.overflow_check.kernel_graph_overflow_check.FileOpen"), \ - patch("atat.mindspore.overflow_check.kernel_graph_overflow_check.json.dump"), \ - patch("atat.mindspore.overflow_check.kernel_graph_overflow_check.logger.info"): + with patch("msprobe.mindspore.overflow_check.kernel_graph_overflow_check.make_dump_path_if_not_exists"), \ + patch("msprobe.mindspore.overflow_check.kernel_graph_overflow_check.FileOpen"), \ + patch("msprobe.mindspore.overflow_check.kernel_graph_overflow_check.json.dump"), \ + patch("msprobe.mindspore.overflow_check.kernel_graph_overflow_check.logger.info"): os.environ["GRAPH_OP_RUN"] = "1" with self.assertRaises(Exception) as context: diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_ms_config.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_ms_config.py similarity index 83% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_ms_config.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_ms_config.py index 3dc3670128c..30212d95e62 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_ms_config.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_ms_config.py @@ -17,9 +17,9 @@ from unittest import TestCase from unittest.mock import patch, mock_open -from atat.core.common.const import Const -from atat.mindspore.ms_config import (parse_json_config, parse_task_config, - TensorConfig, StatisticsConfig, OverflowCheck) +from 
msprobe.core.common.const import Const +from msprobe.mindspore.ms_config import (parse_json_config, parse_task_config, + TensorConfig, StatisticsConfig, OverflowCheckConfig) class TestMsConfig(TestCase): @@ -37,8 +37,8 @@ class TestMsConfig(TestCase): "summary_mode": "statistics" } } - with patch("atat.mindspore.ms_config.FileOpen", mock_open(read_data='')), \ - patch("atat.mindspore.ms_config.json.load", return_value=mock_json_data): + with patch("msprobe.mindspore.ms_config.FileOpen", mock_open(read_data='')), \ + patch("msprobe.mindspore.ms_config.json.load", return_value=mock_json_data): common_config, task_config = parse_json_config("./config.json") self.assertEqual(common_config.task, Const.STATISTICS) self.assertEqual(task_config.data_mode, ["all"]) @@ -62,7 +62,7 @@ class TestMsConfig(TestCase): self.assertTrue(isinstance(task_config, StatisticsConfig)) task_config = parse_task_config("overflow_check", mock_json_config) - self.assertTrue(isinstance(task_config, OverflowCheck)) + self.assertTrue(isinstance(task_config, OverflowCheckConfig)) with self.assertRaises(Exception) as context: parse_task_config("free_benchmark", mock_json_config) diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_overflow_check_tool_factory.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py similarity index 88% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_overflow_check_tool_factory.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py index 497fe1376ab..47da051d4fd 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_overflow_check_tool_factory.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py @@ -16,9 +16,9 @@ """ from unittest import TestCase -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from 
atat.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory class TestOverflowCheckToolFactory(TestCase): diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_precision_debugger.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_precision_debugger.py similarity index 79% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_precision_debugger.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_precision_debugger.py index 834a58e41a4..425ed3040dc 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_precision_debugger.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_precision_debugger.py @@ -17,9 +17,9 @@ from unittest import TestCase from unittest.mock import patch -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.debugger.precision_debugger import PrecisionDebugger +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.debugger.precision_debugger import PrecisionDebugger class TestPrecisionDebugger(TestCase): @@ -35,16 +35,16 @@ class TestPrecisionDebugger(TestCase): "dump_path": "/absolute_path", "rank": [], "step": [], - "level": "L1" + "level": "L0" } common_config = CommonConfig(json_config) task_config = BaseConfig(json_config) handler = Handler() - with patch("atat.mindspore.debugger.precision_debugger.parse_json_config", + with patch("msprobe.mindspore.debugger.precision_debugger.parse_json_config", return_value=[common_config, task_config]), \ - patch("atat.mindspore.debugger.precision_debugger.TaskHandlerFactory.create", 
return_value=handler): + patch("msprobe.mindspore.debugger.precision_debugger.TaskHandlerFactory.create", return_value=handler): debugger = PrecisionDebugger() debugger.start() self.assertTrue(isinstance(debugger.config, DebuggerConfig)) diff --git a/debug/accuracy_tools/atat/test/mindspore_ut/test_task_handler_factory.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_task_handler_factory.py similarity index 82% rename from debug/accuracy_tools/atat/test/mindspore_ut/test_task_handler_factory.py rename to debug/accuracy_tools/msprobe/test/mindspore_ut/test_task_handler_factory.py index 02cd9934cb1..41be7b1db6c 100644 --- a/debug/accuracy_tools/atat/test/mindspore_ut/test_task_handler_factory.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/test_task_handler_factory.py @@ -17,10 +17,10 @@ from unittest import TestCase from unittest.mock import patch -from atat.core.common_config import CommonConfig, BaseConfig -from atat.mindspore.debugger.debugger_config import DebuggerConfig -from atat.mindspore.dump.kernel_graph_dump import KernelGraphDump -from atat.mindspore.task_handler_factory import TaskHandlerFactory +from msprobe.core.common_config import CommonConfig, BaseConfig +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump +from msprobe.mindspore.task_handler_factory import TaskHandlerFactory class TestTaskHandlerFactory(TestCase): @@ -47,7 +47,7 @@ class TestTaskHandlerFactory(TestCase): handler = TaskHandlerFactory.create(config) self.assertTrue(isinstance(handler, KernelGraphDump)) - with patch("atat.mindspore.task_handler_factory.TaskHandlerFactory.tasks", new=tasks): + with patch("msprobe.mindspore.task_handler_factory.TaskHandlerFactory.tasks", new=tasks): with self.assertRaises(Exception) as context: TaskHandlerFactory.create(config) self.assertEqual(str(context.exception), "Can not find task handler") diff --git 
a/debug/accuracy_tools/atat/test/pytorch_ut/advisor/test_advisor.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/advisor/test_advisor.py similarity index 85% rename from debug/accuracy_tools/atat/test/pytorch_ut/advisor/test_advisor.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/advisor/test_advisor.py index 78e5b489e7a..176b80068f7 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/advisor/test_advisor.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/advisor/test_advisor.py @@ -2,12 +2,13 @@ import difflib import os import shutil import unittest +import logging from unittest.mock import patch import pandas -from atat.pytorch.advisor.advisor import Advisor -from atat.pytorch.advisor.advisor_const import AdvisorConst +from msprobe.pytorch.advisor.advisor import Advisor +from msprobe.pytorch.advisor.advisor_const import AdvisorConst class TestAdvisor(unittest.TestCase): @@ -70,11 +71,11 @@ class TestAdvisor(unittest.TestCase): output_content = out_file.read().splitlines() result = list(difflib.unified_diff(standard_content, output_content, n=0)) if result: - print('\n\n-------------------------------------------------------------------------', flush=True) - print(f'[ERROR] {output_file.replace(self.output_path, "")} advisor summary are inconsistent.', - flush=True) - print('\n'.join(result), flush=True) - print('-------------------------------------------------------------------------', flush=True) + logging.basicConfig(level=logging.INFO) + logging.info('\n\n-------------------------------------------------------------------------') + logging.error(f'[ERROR] {output_file.replace(self.output_path, "")} advisor summary are inconsistent.') + logging.error('\n'.join(result)) + logging.info('\n\n-------------------------------------------------------------------------') self.has_error = True diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py 
b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py similarity index 96% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py index 16d0c0bc127..56d100f0a1b 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py @@ -1,12 +1,12 @@ import unittest from unittest.mock import patch -from atat.pytorch.api_accuracy_checker.common.utils import * +from msprobe.pytorch.api_accuracy_checker.common.utils import * class TestUtils(unittest.TestCase): - @patch('atat.pytorch.api_accuracy_checker.common.utils.get_file_content_bytes') + @patch('msprobe.pytorch.api_accuracy_checker.common.utils.get_file_content_bytes') def test_get_json_contents_should_raise_exception(self, mock_get_file_content_bytes): mock_get_file_content_bytes.return_value = 'not a dict' with self.assertRaises(CompareException) as ce: diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/common/test_config.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py similarity index 92% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/common/test_config.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py index 066e74aa518..35fc6164763 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/common/test_config.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py @@ -2,7 +2,7 @@ import unittest import os from unittest.mock import patch -from atat.pytorch.api_accuracy_checker.common.config import Config +from msprobe.pytorch.api_accuracy_checker.common.config import Config class 
TestConfig(unittest.TestCase): @@ -35,5 +35,5 @@ class TestConfig(unittest.TestCase): validate_white_list = ['conv1d', 'max_pool1d', 'dropout', '__add__'] self.assertEqual(self.cfg.validate('white_list', validate_white_list), validate_white_list) - with self.assertRaises(ValueError): + with self.assertRaises(Exception): self.cfg.validate('white_list', ['invalid_api1', 'invalid_api2']) diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py similarity index 98% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py index 9604e7a681c..35a8b9f1fa5 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py @@ -2,7 +2,7 @@ import unittest import numpy as np -from atat.pytorch.api_accuracy_checker.compare import algorithm as alg +from msprobe.pytorch.api_accuracy_checker.compare import algorithm as alg class TestAlgorithmMethods(unittest.TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py similarity index 94% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py index 7717d826577..540460d0896 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +++ 
b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py @@ -2,14 +2,14 @@ import unittest import pandas as pd -from atat.pytorch.api_accuracy_checker.compare.api_precision_compare import ( +from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import ( CompareConfig, BenchmarkStandard, check_csv_columns, check_error_rate, get_api_checker_result, ) -from atat.core.common.const import CompareConst +from msprobe.core.common.const import CompareConst class TestApiPrecisionCompare(unittest.TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py similarity index 96% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py index 2c97471c7ab..e1e6d51de29 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py @@ -7,9 +7,9 @@ import unittest import numpy as np import torch.nn.functional -from atat.pytorch.api_accuracy_checker.compare.compare import Comparator -from atat.pytorch.api_accuracy_checker.compare.compare_column import CompareColumn -from atat.pytorch.api_accuracy_checker.run_ut.run_ut import UtDataInfo +from msprobe.pytorch.api_accuracy_checker.compare.compare import Comparator +from msprobe.pytorch.api_accuracy_checker.compare.compare_column import CompareColumn +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import UtDataInfo current_time = time.strftime("%Y%m%d%H%M%S") RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv" diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py 
b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py similarity index 68% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py index ee25a25e74d..782321868a8 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py @@ -1,6 +1,6 @@ import unittest -from atat.pytorch.api_accuracy_checker.compare.compare_column import ApiPrecisionOutputColumn +from msprobe.pytorch.api_accuracy_checker.compare.compare_column import ApiPrecisionOutputColumn class TestCompareColumns(unittest.TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py similarity index 88% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py index 93f3c2c73e1..ac9c974ea3e 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py @@ -2,8 +2,8 @@ import unittest import numpy as np -from atat.pytorch.api_accuracy_checker.common.utils import CompareException -from atat.pytorch.api_accuracy_checker.compare.compare_utils import check_dtype_comparable, convert_str_to_float +from msprobe.pytorch.api_accuracy_checker.common.utils import CompareException +from msprobe.pytorch.api_accuracy_checker.compare.compare_utils import check_dtype_comparable, convert_str_to_float class 
TestCompareUtils(unittest.TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json similarity index 100% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json similarity index 100% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py similarity index 96% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py index f47c71c984f..f664dad197f 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py @@ -3,8 +3,8 @@ import os import unittest import copy -from atat.pytorch.api_accuracy_checker.run_ut.data_generate import * -from atat.pytorch.api_accuracy_checker.common.utils import get_json_contents +from msprobe.pytorch.api_accuracy_checker.run_ut.data_generate import * +from msprobe.pytorch.api_accuracy_checker.common.utils import get_json_contents base_dir = os.path.dirname(os.path.realpath(__file__)) forward_file = os.path.join(base_dir, "forward.json") diff --git 
a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py similarity index 83% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py index 6a9071f15ea..27126cdddda 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py @@ -5,7 +5,7 @@ import logging from unittest.mock import patch, mock_open, MagicMock import json import signal -from atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut import split_json_file, signal_handler, run_parallel_ut, \ +from msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut import split_json_file, signal_handler, run_parallel_ut, \ prepare_config, main, ParallelUTConfig @@ -20,7 +20,7 @@ class TestMultiRunUT(unittest.TestCase): {'key3': 'TRUE', 'key4': 'TRUE'} ] - @patch('atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut.FileOpen') + @patch('msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut.FileOpen') def test_split_json_file(self, mock_FileOpen): mock_FileOpen.return_value.__enter__.return_value = mock_open(read_data=self.test_json_content).return_value num_splits = 2 @@ -48,7 +48,7 @@ class TestMultiRunUT(unittest.TestCase): device_id=[0, 1], result_csv_path='result.csv', total_items=2, - real_data_path=None + config_path=None ) mock_file.side_effect = [ @@ -63,10 +63,10 @@ class TestMultiRunUT(unittest.TestCase): @patch('os.remove') @patch('os.path.realpath', side_effect=lambda x: x) - @patch('atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut.check_link') - @patch('atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut.check_file_suffix') - 
@patch('atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut.FileChecker') - @patch('atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut.split_json_file', + @patch('msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut.check_link') + @patch('msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut.check_file_suffix') + @patch('msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut.FileChecker') + @patch('msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut.split_json_file', return_value=(['forward_split1.json', 'forward_split2.json'], 2)) def test_prepare_config(self, mock_split_json_file, mock_FileChecker, mock_check_file_suffix, mock_check_link, mock_realpath, mock_remove): @@ -81,7 +81,7 @@ class TestMultiRunUT(unittest.TestCase): args.jit_compile = False args.device_id = [0, 1] args.result_csv_path = None - args.real_data_path = None + args.config_path = None config = prepare_config(args) @@ -93,8 +93,8 @@ class TestMultiRunUT(unittest.TestCase): @patch('argparse.ArgumentParser.parse_args') - @patch('atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut.prepare_config') - @patch('atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut.run_parallel_ut') + @patch('msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut.prepare_config') + @patch('msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut.run_parallel_ut') def test_main(self, mock_run_parallel_ut, mock_prepare_config, mock_parse_args): main() mock_parse_args.assert_called() diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py similarity index 95% rename from debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py index 97dccd2b58f..bc643794ab6 100644 --- 
a/debug/accuracy_tools/atat/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py @@ -4,8 +4,8 @@ import copy import unittest import torch from unittest.mock import patch, DEFAULT -from atat.pytorch.api_accuracy_checker.run_ut.run_ut import * -from atat.pytorch.api_accuracy_checker.common.utils import get_json_contents +from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import * +from msprobe.pytorch.api_accuracy_checker.common.utils import get_json_contents base_dir = os.path.dirname(os.path.realpath(__file__)) forward_file = os.path.join(base_dir, "forward.json") diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_acc_compare.py new file mode 100644 index 00000000000..288e259c0aa --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_acc_compare.py @@ -0,0 +1,267 @@ +# coding=utf-8 +import unittest +import pandas as pd +from msprobe.pytorch.compare import acc_compare as compare + +npu_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', + 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'], + 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), + ('torch.float32', [16])], + 'output_struct': [('torch.float32', [1, 16, 28, 28])], + 'summary': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], + [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], + [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], + [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} + +bench_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', + 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'], + 'input_struct': [('torch.float32', 
[1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), + ('torch.float32', [16])], + 'output_struct': [('torch.float32', [1, 16, 28, 28])], + 'summary': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], + [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], + [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], + [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} + +tensor_list = [ + {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.33033010363578796, + 'Min': -0.331031858921051,'Mean': -0.030964046716690063, 'Norm': 2.2533628940582275, 'requires_grad': True, + 'full_op_name': 'Tensor.add_.0.forward_input.0'}, + {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward_input.1'}, + {'full_op_name': 'Tensor.add_.0.forward_input.alpha.0', 'dtype': "", "shape": '[]', 'md5': None, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1'}, + {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward_output.0'} +] + +result_op_dict = {'op_name': ['Tensor.add_.0.forward_input.0', 'Tensor.add_.0.forward_input.1', + 'Tensor.add_.0.forward_input.alpha.0', 'Tensor.add_.0.forward_output.0'], + 'input_struct': [('torch.float32', [16, 1, 3, 3]), ('torch.float32', [16, 1, 3, 3]), + ("", '[]')], + 'output_struct': [('torch.float32', [16, 1, 3, 3])], + 'summary': [[0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275], + [0.003992878366261721, -0.008102823048830032, -0.0002002553956117481, 0.02844562754034996], + [-0.1, -0.1, -0.1, -0.1], + 
[0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275]], + 'stack_info': []} + +o_result = [ + ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.0', 'torch.float32', 'torch.float32', + [1, 1, 28, 28], [1, 1, 28, 28], 0.0, 0.0, 0.0, ' ', '0.0%', '0.0%', '0.0%', ' ', 3.029174327850342, -2.926689624786377, + -0.06619918346405029, 3.029174327850342, -2.926689624786377, -0.06619918346405029, '', '', 'None'], + ['Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.1', 'torch.float32', 'torch.float32', + [16, 1, 5, 5], [16, 1, 5, 5], 0.0, 0.0, 0.0, ' ', '0.0%', '0.0%', '0.0%', ' ', 0.19919930398464203, -0.19974489510059357, + 0.006269412115216255, 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, '', '', 'None'], + ['Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_input.2', 'torch.float32', 'torch.float32', + [16], [16], 0.0, 0.0, 0.0, ' ', '0.0%', '0.0%', '0.0%', ' ', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, + 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, '', '', 'None'], + ['Functional_conv2d_0_forward_output', 'Functional_conv2d_0_forward_output', 'torch.float32', 'torch.float32', + [1, 16, 28, 28], [1, 16, 28, 28], 0.0, 0.0, 0.0, ' ', '0.0%', '0.0%', '0.0%', ' ', 2.1166646480560303, -2.190781354904175, + -0.003579073818400502, 2.1166646480560303, -2.190781354904175, -0.003579073818400502, '', '', 'None']] + +npu_dict_aten = {'op_name': ['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.1', + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.2', + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.3', + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.4', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.0', + 
'Aten__native_batch_norm_legit_functional.default_0_forward_output.1', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.2', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.3', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.4'], + 'input_struct': [('torch.float16', [256, 256, 14, 14]), ('torch.float32', [256]), + ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256])], + 'output_struct': [('torch.float16', [256, 256, 14, 14]), ('torch.float32', [256]), + ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256])], + 'summary': [[139.625, -127.5625, -0.0103607177734375], + [2.5276029109954834, -2.1788690090179443, -0.0008259844034910202], + [2.472219944000244, -2.845968723297119, -0.008756577968597412], + [2.763145923614502, -3.398397922515869, -0.052132632583379745], + [2.673110008239746, -3.149275064468384, 0.01613386906683445], + [13.5546875, -10.640625, -0.008758544921875], + [0.30550330877304077, -0.24485322833061218, -0.010361209511756897], + [623.9192504882812, 432.96826171875, 520.2276611328125], + [2.4797861576080322, -3.055997371673584, -0.04795549064874649], + [61.7945556640625, 42.59713363647461, 52.03831481933594]]} + +bench_dict_functional = { + 'op_name': ['Functional_batch_norm_0_forward_input.0', 'Functional_batch_norm_0_forward_input.1', + 'Functional_batch_norm_0_forward_input.2', 'Functional_batch_norm_0_forward_input.3', + 'Functional_batch_norm_0_forward_input.4', 'Functional_batch_norm_0_forward_output'], + 'input_struct': [('torch.float32', [256, 256, 14, 14]), ('torch.float32', [256]), ('torch.float32', [256]), + ('torch.float32', [256]), ('torch.float32', [256])], + 'output_struct': [('torch.float32', [256, 256, 14, 14])], + 'summary': [[3.061628818511963, -3.22507381439209, 3.634914173744619e-05], + [0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06], + [0.9338104128837585, 0.9277191162109375, 0.930335283279419], + [1.0, 
1.0, 1.0], [0.0, 0.0, 0.0], + [5.397906303405762, -5.796811580657959, 2.5283952709287405e-10]] +} + +aten_result = [ + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', + 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 136.56337118148804, -124.33742618560791, + -0.010397066915174946, ' ', '4460.480981749501%', '3855.335826136584%', '28603.33536971545%', ' ', 139.625, + -127.5625, -0.0103607177734375, 3.061628818511963, -3.22507381439209, 3.634914173744619e-05, 'Warning', + 'Need double check api accuracy.', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.1', 'Functional_batch_norm_0_forward_input.1', + 'torch.float32', 'torch.float32', [256], [256], 2.527024927258026, -2.1782388387364335, -0.0008296193100250093, + ' ', '437213.84590749856%', '345658.76916858414%', '22823.676544842117%', ' ', 2.5276029109954834, + -2.1788690090179443, -0.0008259844034910202, 0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06, + 'Warning', 'Need double check api accuracy.', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.2', 'Functional_batch_norm_0_forward_input.2', + 'torch.float32', 'torch.float32', [256], [256], 1.5384095311164856, -3.7736878395080566, -0.9390918612480164, ' ', + '164.74538192025793%', '406.7705163736246%', '100.94122819224167%', ' ', 2.472219944000244, -2.845968723297119, + -0.008756577968597412, 0.9338104128837585, 0.9277191162109375, 0.930335283279419, 'Warning', + 'Need double check api accuracy.', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.3', 'Functional_batch_norm_0_forward_input.3', + 'torch.float32', 'torch.float32', [256], [256], 1.763145923614502, -4.398397922515869, -1.0521326325833797, ' ', + '176.3145923614502%', '439.8397922515869%', '105.21326325833797%', ' ', 2.763145923614502, -3.398397922515869, + -0.052132632583379745, 1.0, 1.0, 1.0, 'Warning', 
'Need double check api accuracy.', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.4', 'Functional_batch_norm_0_forward_input.4', + 'torch.float32', 'torch.float32', [256], [256], 2.673110008239746, -3.149275064468384, 0.01613386906683445, ' ', + 'N/A', 'N/A', 'N/A', ' ', 2.673110008239746, -3.149275064468384, 0.01613386906683445, 0.0, 0.0, 0.0, 'Warning', + 'Need double check api accuracy.', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.0', 'Functional_batch_norm_0_forward_output', + 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 8.156781196594238, -4.843813419342041, + -0.008758545174714527, ' ', '151.11009228611078%', '83.55995967687207%', '3464072756.115108%', ' ', 13.5546875, + -10.640625, -0.008758544921875, 5.397906303405762, -5.796811580657959, 2.5283952709287405e-10, 'Warning', + 'Need double check api accuracy.', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.1', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 0.30550330877304077, -0.24485322833061218, -0.010361209511756897, 'Nan', 'Nan', 'Nan', + 'Yes', '', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.2', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 623.9192504882812, 432.96826171875, 520.2276611328125, 'Nan', 'Nan', 'Nan', + 'Yes', '', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.3', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 2.4797861576080322, -3.055997371673584, -0.04795549064874649, 'Nan', 'Nan', 'Nan', + 'Yes', '', 'None'], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.4', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 61.7945556640625, 42.59713363647461, 52.03831481933594, 'Nan', 'Nan', 'Nan', + 'Yes', '', 'None']] + +highlight_dict = {'red_rows': [], 'yellow_rows': []} + +num_0, 
num_1, num_2, num_3 = 0, 1, 2, 3 +summary_line_input = ['Functional_batch_norm_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', + 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.01, 0, 0, 0, 1, 1, 1, 1, 1.01, 1, 1, 1, + 'Yes', ''] +summary_line_1 = ['Functional_batch_norm_0_forward_output.0', 'Functional_batch_norm_0_forward_output.0', + 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 10, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 1, + 'Warning', ''] +summary_line_2 = ['Functional_batch_norm_0_forward_output.1', 'Functional_batch_norm_0_forward_output.1', + 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.02, 0, 0, 0, 0.12, 0, 1, 1, 0.1, 1, 1, 1, + 'Warning', ''] +summary_line_3 = ['Functional_batch_norm_0_forward_output.2', 'Functional_batch_norm_0_forward_output.2', + 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 1, + 'Warning', ''] +line_input = ['Functional_batch_norm_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 1, 1, 1, 0.95, 1, 1, 1, 1, 1, 1.01, 1, 1, 1, + 'Yes', ''] +line_1 = ['Functional_batch_norm_0_forward_output.0', 'Functional_batch_norm_0_forward_output.0', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 1, 1, 0.59, 1, 'nan', 0, 1, 1, 19, 1, 1, 1, + 'Warning', ''] +line_2 = ['Functional_batch_norm_0_forward_output.1', 'Functional_batch_norm_0_forward_output.1', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.9, 1, 1, 0.8, 1, 0, 0.12, 0, 1, 1, 0.1, 1, 1, 1, + 'Warning', ''] +line_3 = ['Functional_batch_norm_0_forward_output.2', 'Functional_batch_norm_0_forward_output.2', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 1.1e+10, 1, 0.85, 1, 9, 0.12, 0, 1, 1, 0.1, 1, + 1, 1, 'Warning', ''] + +op_data = { + 'input_args': 
[{'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.33033010363578796, 'Min': -0.331031858921051,'Mean': -0.030964046716690063, + 'Norm': 2.2533628940582275, 'requires_grad': True}, + {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, + 'Norm': 0.02844562754034996, 'requires_grad': False}], + 'input_kwargs': {'alpha': {'type': 'float', 'value': -0.1}}, + 'output': [{'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.33033010363578796, 'Min': -0.331031858921051,'Mean': -0.030964046716690063, + 'Norm': 2.2533628940582275, 'requires_grad': True}]} + +op_name = "Tensor.add_0.0.forward" + +op_result = [ + {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward_input.0'}, + {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.forward_input.1'}, + {'full_op_name': 'Tensor.add_0.0.forward_input.alpha.0', 'dtype': "", 'shape': '[]', 'md5': None, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1'}, + {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], + 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward_output.0'}] + + +class TestUtilsMethods(unittest.TestCase): + + def test_check_graph_mode(self): + op1 = "Aten" + op2 = "torch" + self.assertTrue(compare.check_graph_mode(op1, op2)) + 
self.assertTrue(compare.check_graph_mode(op2, op1)) + self.assertFalse(compare.check_graph_mode(op1, op1)) + self.assertFalse(compare.check_graph_mode(op2, op2)) + + def test_check_op(self): + fuzzy_match = False + result = compare.check_op(npu_dict, bench_dict, fuzzy_match) + self.assertEqual(result, True) + + def test_merge_tensor(self): + op_dict = compare.merge_tensor(tensor_list, True, False) + self.assertEqual(op_dict, result_op_dict) + + def test_read_op(self): + result = compare.read_op(op_data, op_name) + self.assertEqual(result, op_result) + + def test_match_op(self): + fuzzy_match = False + a, b = compare.match_op([npu_dict], [bench_dict], fuzzy_match) + self.assertEqual(a, 0) + self.assertEqual(b, 0) + + def test_get_accuracy(self): + result = [] + compare.get_accuracy(result, npu_dict, bench_dict, highlight_dict) + self.assertEqual(result, o_result) + + def test_get_accuracy_graph_mode(self): + result = [] + compare.get_accuracy(result, npu_dict_aten, bench_dict_functional, highlight_dict) + self.assertEqual(result, aten_result) + + def test_find_error_rows(self): + summary_result = [summary_line_input, summary_line_1, summary_line_2, summary_line_3] + highlight_dict = {'red_rows': [], 'yellow_rows': []} + compare.find_error_rows(summary_result, 0, 1, highlight_dict, summary_compare=True) + self.assertEqual(highlight_dict, {'red_rows': [], 'yellow_rows': []}) + + def test_find_compare_result_error_rows(self): + result = [line_input, line_1, line_2, line_3] + result_df = pd.DataFrame(result) + highlight_dict = {'red_rows': [], 'yellow_rows': []} + compare.find_compare_result_error_rows(result_df, highlight_dict, False, False) + self.assertEqual(highlight_dict, {'red_rows': [num_1, num_3], 'yellow_rows': [num_2]}) + + def test_rename_api(self): + test_name_1 = "Distributed.broadcast.0.forward.input.0" + expect_name_1 = "Distributed.broadcast.input.0" + actual_name_1 = compare.rename_api(test_name_1, "forward") + self.assertEqual(actual_name_1, 
expect_name_1) + + test_name_2 = "Torch.sum.0.backward.output.0" + expect_name_2 = "Torch.sum.output.0" + actual_name_2 = compare.rename_api(test_name_2, "backward") + self.assertEqual(actual_name_2, expect_name_2) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_match.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_match.py new file mode 100644 index 00000000000..ac28e994e9c --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_match.py @@ -0,0 +1,20 @@ +# coding=utf-8 +import unittest +from msprobe.pytorch.compare import match + + +class TestMatch(unittest.TestCase): + def test_graph_mapping(self): + op1 = "Aten_convolution_1_forward_0.input.0" + op2 = "Torch_conv2d_0_forward_0.input.0" + op3 = "Torch_batch_norm_0_forward_0.input.0" + op4 = "Aten_convolution.default_1_forward_0.input.0" + op5 = "Aten_foo_1_forward_0.input.0" + self.assertTrue(match.graph_mapping.match(op1, op2)) + self.assertTrue(match.graph_mapping.match(op2, op1)) + self.assertTrue(match.graph_mapping.match(op4, op2)) + self.assertTrue(match.graph_mapping.match(op2, op4)) + self.assertFalse(match.graph_mapping.match(op1, op3)) + self.assertFalse(match.graph_mapping.match(op3, op1)) + self.assertFalse(match.graph_mapping.match(op5, op2)) + self.assertFalse(match.graph_mapping.match(op2, op5)) diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py similarity index 94% rename from debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py index 828d646c52f..ad9eb5cd0ed 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +++ 
b/debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py @@ -1,10 +1,10 @@ from unittest import TestCase import torch -from atat.core.common.const import Const -from atat.pytorch.free_benchmark.common.enums import DeviceType, PerturbationMode -from atat.pytorch.free_benchmark.common.params import data_pre_deal -from atat.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark.common.enums import DeviceType, PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import data_pre_deal +from msprobe.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory class TestPerturbedLayer(TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py similarity index 91% rename from debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py index d46e26e0948..399efeb42d7 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py @@ -2,17 +2,17 @@ from abc import ABC from unittest import TestCase import torch -from atat.core.common.const import Const -from atat.pytorch.free_benchmark.common.constant import PreheatConfig, ThresholdConfig -from atat.pytorch.free_benchmark.common.counter import preheat_counter -from atat.pytorch.free_benchmark.common.enums import ( +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark.common.constant import PreheatConfig, ThresholdConfig +from msprobe.pytorch.free_benchmark.common.counter import 
preheat_counter +from msprobe.pytorch.free_benchmark.common.enums import ( DeviceType, FuzzLevel, HandlerType, PerturbationMode, ) -from atat.pytorch.free_benchmark.common.params import DataParams, make_handler_params -from atat.pytorch.free_benchmark.result_handlers.handler_factory import ( +from msprobe.pytorch.free_benchmark.common.params import DataParams, make_handler_params +from msprobe.pytorch.free_benchmark.result_handlers.handler_factory import ( FuzzHandlerFactory, ) diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/test_main.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/test_main.py similarity index 92% rename from debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/test_main.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/test_main.py index d326e993c07..4498a2af705 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/free_benchmark/test_main.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/free_benchmark/test_main.py @@ -4,10 +4,10 @@ from unittest import TestCase import torch import torch.nn as nn -from atat.core.common.const import Const -from atat.pytorch.free_benchmark import FreeBenchmarkCheck -from atat.pytorch.free_benchmark.common.constant import CommonField, PreheatConfig -from atat.pytorch.free_benchmark.common.enums import ( +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck +from msprobe.pytorch.free_benchmark.common.constant import CommonField, PreheatConfig +from msprobe.pytorch.free_benchmark.common.enums import ( DeviceType, FuzzLevel, HandlerType, diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/functional/test_dump_module.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/functional/test_dump_module.py new file mode 100644 index 00000000000..d67adf2f912 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/functional/test_dump_module.py @@ -0,0 +1,15 @@ +import unittest + +import 
torch.nn as nn +from msprobe.pytorch import PrecisionDebugger +from msprobe.pytorch.functional.dump_module import module_dump, module_count + + +class TestDumpModule(unittest.TestCase): + def setUp(self): + self.module = nn.Linear(in_features=8, out_features=4) + + def test_module_dump(self): + PrecisionDebugger(dump_path="./dump") + module_dump(self.module, "TestModule") + self.assertTrue("TestModule" in module_count) diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_api_registry.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_api_registry.py similarity index 91% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_api_registry.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_api_registry.py index c80e5dbed45..837ad23df76 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_api_registry.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_api_registry.py @@ -1,5 +1,5 @@ import unittest -from atat.pytorch.hook_module.api_registry import ApiRegistry, torch_version_above_2, is_gpu +from msprobe.pytorch.hook_module.api_registry import ApiRegistry, torch_version_above_2, is_gpu class TestApiRegistry(unittest.TestCase): @@ -43,7 +43,7 @@ class TestApiRegistry(unittest.TestCase): import torch import torch.distributed as dist #import torch_npu #门禁没有安装torch_npu - from atat.pytorch.hook_module.api_registry import torch_without_guard_version, npu_distributed_api, is_gpu, torch_version_above_2 + from msprobe.pytorch.hook_module.api_registry import torch_without_guard_version, npu_distributed_api, is_gpu, torch_version_above_2 @@ -79,7 +79,7 @@ class TestApiRegistry(unittest.TestCase): import torch import torch.distributed as dist #import torch_npu #门禁没有安装torch_npu - from atat.pytorch.hook_module.api_registry import torch_without_guard_version, npu_distributed_api, is_gpu, torch_version_above_2 + from msprobe.pytorch.hook_module.api_registry import 
torch_without_guard_version, npu_distributed_api, is_gpu, torch_version_above_2 diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_hook_module.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_hook_module.py similarity index 94% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_hook_module.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_hook_module.py index 646f6415226..50783e5d736 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_hook_module.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_hook_module.py @@ -1,7 +1,7 @@ import unittest from unittest.mock import patch, Mock -from atat.pytorch.hook_module.hook_module import HOOKModule +from msprobe.pytorch.hook_module.hook_module import HOOKModule class TestHookModule(unittest.TestCase): def test_call_1(self): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_aten.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py similarity index 96% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_aten.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py index 92aee790ddd..4940b07cb0d 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_aten.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py @@ -1,6 +1,6 @@ import unittest import torch -from atat.pytorch.hook_module.wrap_aten import AtenOPTemplate, AtenOPPacketTemplate +from msprobe.pytorch.hook_module.wrap_aten import AtenOPTemplate, AtenOPPacketTemplate def hook(name): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_distributed.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py similarity index 95% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_distributed.py rename to 
debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py index bd0501ef2fd..9a375e45bfc 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_distributed.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py @@ -1,6 +1,6 @@ import unittest import torch.distributed as dist -from atat.pytorch.hook_module.wrap_distributed import * +from msprobe.pytorch.hook_module.wrap_distributed import * class TestWrapDistributed(unittest.TestCase): def hook(name, prefix): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_functional.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py similarity index 91% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_functional.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py index 232117498b5..f43b8ea6cb9 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_functional.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py @@ -1,6 +1,6 @@ import unittest import torch -from atat.pytorch.hook_module import wrap_functional as wf +from msprobe.pytorch.hook_module import wrap_functional as wf class TestWrapFunctional(unittest.TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_tensor.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py similarity index 88% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_tensor.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py index e027270540e..61f76b0ca0a 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_tensor.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py @@ -1,7 +1,7 @@ import unittest import torch import yaml -from 
atat.pytorch.hook_module.wrap_tensor import get_tensor_ops, HOOKTensor, TensorOPTemplate, wrap_tensor_op, wrap_tensor_ops_and_bind +from msprobe.pytorch.hook_module.wrap_tensor import get_tensor_ops, HOOKTensor, TensorOPTemplate, wrap_tensor_op, wrap_tensor_ops_and_bind class TestWrapTensor(unittest.TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_torch.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py similarity index 96% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_torch.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py index 8817bc758ae..e1a3e77983d 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_torch.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py @@ -1,7 +1,7 @@ import unittest import torch import yaml -from atat.pytorch.hook_module.wrap_torch import * +from msprobe.pytorch.hook_module.wrap_torch import * class TestWrapTorch(unittest.TestCase): diff --git a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_vf.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py similarity index 82% rename from debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_vf.py rename to debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py index 8d57fad6eb6..98efb4bc5b8 100644 --- a/debug/accuracy_tools/atat/test/pytorch_ut/hook_module/test_wrap_vf.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py @@ -1,6 +1,6 @@ import unittest import torch -from atat.pytorch.hook_module import wrap_vf +from msprobe.pytorch.hook_module import wrap_vf class TestWrapVF(unittest.TestCase): def setUp(self): diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/test_pt_config.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/test_pt_config.py new file mode 100644 index 00000000000..470390d77b2 --- 
/dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/test_pt_config.py @@ -0,0 +1,84 @@ +from unittest import TestCase +from unittest.mock import patch, mock_open + +from msprobe.core.common.const import Const +from msprobe.pytorch.pt_config import parse_json_config, parse_task_config + + +class TestPtConfig(TestCase): + def test_parse_json_config(self): + mock_json_data = { + "task": "statistics", + "dump_path": "./dump/", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "statistics": { + "scope": [], + "list": [], + "data_mode": ["all"], + }, + "tensor": { + "file_format": "npy" + } + } + with patch("msprobe.pytorch.pt_config.os.path.join", return_value="/path/config.json"), \ + patch("msprobe.pytorch.pt_config.FileOpen", mock_open(read_data='')), \ + patch("msprobe.pytorch.pt_config.json.load", return_value=mock_json_data): + common_config, task_config = parse_json_config(None, None) + self.assertEqual(common_config.task, Const.STATISTICS) + self.assertEqual(task_config.data_mode, ["all"]) + + with patch("msprobe.pytorch.pt_config.os.path.join", return_value="/path/config.json"), \ + patch("msprobe.pytorch.pt_config.FileOpen", mock_open(read_data='')), \ + patch("msprobe.pytorch.pt_config.json.load", return_value=mock_json_data): + common_config, task_config = parse_json_config(None, Const.TENSOR) + self.assertEqual(common_config.task, Const.STATISTICS) + self.assertEqual(task_config.file_format, "npy") + + def test_parse_task_config(self): + overflow_check_config = { + "overflow_check": { + "overflow_nums": 1, + "check_mode": "all" + } + } + result = parse_task_config(Const.OVERFLOW_CHECK, overflow_check_config) + self.assertEqual(result.overflow_nums, 1) + self.assertEqual(result.check_mode, "all") + + free_benchmark_config = { + "free_benchmark": { + "scope": [], + "list": ["conv2d"], + "fuzz_device": "npu", + "pert_mode": "improve_precision", + "handler_type": "check", + "fuzz_level": "L1", + "fuzz_stage": "forward", + "if_preheat": 
False, + "preheat_step": 15, + "max_sample": 20 + } + } + result = parse_task_config(Const.FREE_BENCHMARK, free_benchmark_config) + self.assertEqual(result.pert_mode, "improve_precision") + self.assertEqual(result.handler_type, "check") + self.assertEqual(result.preheat_step, 15) + self.assertEqual(result.max_sample, 20) + + run_ut_config = { + "run_ut": { + "white_list": ["conv2d"], + "black_list": ["matmul"], + "error_data_path": '/home/dump_path' + + } + } + with patch('os.path.exists', return_value=True) as mocked_exists: + result = parse_task_config(Const.RUN_UT, run_ut_config) + self.assertEqual(result.white_list, ["conv2d"]) + self.assertEqual(result.black_list, ["matmul"]) + self.assertEqual(result.error_data_path, '/home/dump_path') + mocked_exists.assert_called_once_with('/home/dump_path') diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/test_service.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/test_service.py new file mode 100644 index 00000000000..c09b6abcb69 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/test_service.py @@ -0,0 +1,59 @@ +import unittest +from unittest.mock import patch, mock_open + +import torch.nn as nn +from msprobe.core.common.utils import Const +from msprobe.pytorch.debugger.debugger_config import DebuggerConfig +from msprobe.pytorch.pt_config import parse_json_config +from msprobe.pytorch.service import Service + + +class TestService(unittest.TestCase): + def setUp(self): + mock_json_data = { + "dump_path": "./dump/", + } + with patch("msprobe.pytorch.pt_config.FileOpen", mock_open(read_data='')), \ + patch("msprobe.pytorch.pt_config.json.load", return_value=mock_json_data): + common_config, task_config = parse_json_config("./config.json", Const.STATISTICS) + self.config = DebuggerConfig(common_config, task_config, Const.STATISTICS, "./ut_dump", "L1") + self.service = Service(self.config) + + def test_start(self): + with patch("msprobe.pytorch.service.get_rank_if_initialized", return_value=0), \ + 
patch("msprobe.pytorch.service.Service.create_dirs", return_value=None): + self.service.start(None) + self.assertEqual(self.service.current_rank, 0) + + def test_stop_and_step(self): + with patch("msprobe.core.data_dump.data_collector.DataCollector.write_json", return_value=None): + self.service.stop() + self.assertFalse(self.service.switch) + + self.service.step() + self.assertEqual(self.service.current_iter, 1) + + def test_register_hook_new(self): + class TestModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(in_features=8, out_features=4) + + def forward(self, x): + x = self.linear(x) + return x + + self.service.model = TestModule() + self.config.level = "L0" + with patch("msprobe.pytorch.service.logger.info_on_rank_0") as mock_logger, \ + patch("msprobe.pytorch.service.remove_dropout", return_value=None): + self.service.register_hook_new() + self.assertEqual(mock_logger.call_count, 2) + + def test_create_dirs(self): + with patch("msprobe.pytorch.service.Path.mkdir", return_value=None), \ + patch("msprobe.core.common.file_check.FileChecker.common_check", return_value=None), \ + patch("msprobe.core.data_dump.data_collector.DataCollector.update_dump_paths", + return_value=None): + self.service.create_dirs() + self.assertEqual(self.service.dump_iter_dir, "./ut_dump/step0") diff --git a/debug/accuracy_tools/atat/test/resources/advisor.txt b/debug/accuracy_tools/msprobe/test/resources/advisor.txt similarity index 100% rename from debug/accuracy_tools/atat/test/resources/advisor.txt rename to debug/accuracy_tools/msprobe/test/resources/advisor.txt diff --git a/debug/accuracy_tools/atat/test/resources/compare_result_20230703104808.csv b/debug/accuracy_tools/msprobe/test/resources/compare_result_20230703104808.csv similarity index 100% rename from debug/accuracy_tools/atat/test/resources/compare_result_20230703104808.csv rename to debug/accuracy_tools/msprobe/test/resources/compare_result_20230703104808.csv diff --git 
a/debug/accuracy_tools/atat/test/resources/compare_result_without_accuracy.csv b/debug/accuracy_tools/msprobe/test/resources/compare_result_without_accuracy.csv similarity index 100% rename from debug/accuracy_tools/atat/test/resources/compare_result_without_accuracy.csv rename to debug/accuracy_tools/msprobe/test/resources/compare_result_without_accuracy.csv diff --git a/debug/accuracy_tools/atat/test/resources/config.yaml b/debug/accuracy_tools/msprobe/test/resources/config.yaml similarity index 100% rename from debug/accuracy_tools/atat/test/resources/config.yaml rename to debug/accuracy_tools/msprobe/test/resources/config.yaml diff --git a/debug/accuracy_tools/atat/test/resources/npu_test.pkl b/debug/accuracy_tools/msprobe/test/resources/npu_test.pkl similarity index 100% rename from debug/accuracy_tools/atat/test/resources/npu_test.pkl rename to debug/accuracy_tools/msprobe/test/resources/npu_test.pkl diff --git a/debug/accuracy_tools/atat/test/run_test.sh b/debug/accuracy_tools/msprobe/test/run_test.sh similarity index 100% rename from debug/accuracy_tools/atat/test/run_test.sh rename to debug/accuracy_tools/msprobe/test/run_test.sh diff --git a/debug/accuracy_tools/atat/test/run_ut.py b/debug/accuracy_tools/msprobe/test/run_ut.py similarity index 97% rename from debug/accuracy_tools/atat/test/run_ut.py rename to debug/accuracy_tools/msprobe/test/run_ut.py index 7c593c14abc..8ea81ccca71 100644 --- a/debug/accuracy_tools/atat/test/run_ut.py +++ b/debug/accuracy_tools/msprobe/test/run_ut.py @@ -3,7 +3,7 @@ import shutil import subprocess import sys -from atat.core.common.log import logger +from msprobe.core.common.log import logger def run_ut(): diff --git a/debug/accuracy_tools/atat/test/test_module_processer.py b/debug/accuracy_tools/msprobe/test/test_module_processer.py similarity index 95% rename from debug/accuracy_tools/atat/test/test_module_processer.py rename to debug/accuracy_tools/msprobe/test/test_module_processer.py index 89ee299f66f..448c35f0554 
100644 --- a/debug/accuracy_tools/atat/test/test_module_processer.py +++ b/debug/accuracy_tools/msprobe/test/test_module_processer.py @@ -1,6 +1,6 @@ import unittest -from atat.pytorch.module_processer import ModuleProcesser -from atat.pytorch.common.utils import Const +from msprobe.pytorch.module_processer import ModuleProcesser +from msprobe.pytorch.common.utils import Const import torch -- Gitee From 24a8fc305a1e55c485dd1f937086bac45892f3e1 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Tue, 6 Aug 2024 20:35:25 +0800 Subject: [PATCH 30/94] =?UTF-8?q?=E5=B0=86=E5=88=86=E7=BA=A7=E5=8F=AF?= =?UTF-8?q?=E8=A7=86=E5=8C=96=E4=BB=A3=E7=A0=81=E6=90=AC=E8=87=B3poc?= =?UTF-8?q?=E5=88=86=E6=94=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/pytorch/compare/acc_compare.py | 3 +- .../msprobe/pytorch/visualization/__init__.py | 0 .../pytorch/visualization/builder/__init__.py | 0 .../visualization/builder/graph_builder.py | 84 +++++++ .../visualization/builder/msprobe_adapter.py | 186 +++++++++++++++ .../pytorch/visualization/compare/__init__.py | 0 .../visualization/compare/graph_comparator.py | 106 +++++++++ .../visualization/compare/mode_adapter.py | 211 ++++++++++++++++++ .../pytorch/visualization/graph/__init__.py | 0 .../pytorch/visualization/graph/base_node.py | 107 +++++++++ .../pytorch/visualization/graph/graph.py | 86 +++++++ .../pytorch/visualization/graph/node_op.py | 37 +++ .../msprobe/pytorch/visualization/test.py | 85 +++++++ .../msprobe/pytorch/visualization/utils.py | 122 ++++++++++ 14 files changed, 1026 insertions(+), 1 deletion(-) create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/__init__.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/builder/__init__.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py create 
mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/compare/__init__.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/graph/__init__.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/test.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/visualization/utils.py diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py index 2a68c756ed3..27d555f5d27 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/acc_compare.py @@ -969,8 +969,9 @@ def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False if npu_ops_queue: for npu_data in npu_ops_queue: get_un_match_accuracy(result, npu_data, md5_compare, summary_compare) + result_to_csv(md5_compare, summary_compare, stack_mode, result) - header = [] +def result_to_csv(md5_compare, summary_compare, stack_mode, result): if md5_compare: header = CompareConst.MD5_COMPARE_RESULT_HEADER[:] elif summary_compare: diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py new file mode 100644 index 00000000000..f623a48ae3b --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..graph.graph import Graph +from ..graph.node_op import NodeOp +from ..utils import load_json_file, load_data_json_file, save_json_file, GraphConst +from .msprobe_adapter import get_input_output + + +class GraphBuilder: + @staticmethod + def build(construct_path, data_path, model_name='DefaultModel'): + """ + GraphBuilder的对外提供的构图方法 + Args: + construct_path: construct.json路径 + data_path: dump.json路径 + model_name: 模型名字,依赖外部输入 + Returns: Graph,代表图的数据结构 + """ + construct_dict = load_json_file(construct_path) + data_dict = load_data_json_file(data_path) + graph = Graph(model_name) + GraphBuilder._init_nodes(graph, construct_dict, data_dict) + return graph + + @staticmethod + def to_json(filename, graph_n, graph_b=None, tool_tip=None): + """ + 将graph导出成.vis文件的接口 + Args: + filename: 输出文件路径 + graph_n: Graph + graph_b: bench Graph,为空是只输出graph_b,不为空会同时输出两个graph,作为对比的结果 + tool_tip: 在对比模型下输出的意见 + """ + result = {} + if graph_b: + result[GraphConst.JSON_NPU_KEY] = graph_n.to_dict() + result[GraphConst.JSON_BENCH_KEY] = graph_b.to_dict() 
+ else: + result = graph_n.to_dict() + if tool_tip: + result[GraphConst.JSON_TIP_KEY] = tool_tip + save_json_file(filename, result) + + @staticmethod + def _init_nodes(graph, construct_dict, data_dict): + for subnode_id, upnode_id in construct_dict.items(): + if upnode_id: + upnode_op = NodeOp.get_node_op(upnode_id) + upnode = GraphBuilder._create_or_get_node(graph, data_dict, upnode_op, upnode_id) + else: + upnode = graph.root + node_op = NodeOp.get_node_op(subnode_id) + GraphBuilder._create_or_get_node(graph, data_dict, node_op, subnode_id, upnode) + + @staticmethod + def _create_or_get_node(graph, data_dict, op, name, upnode=None): + if name in graph.node_map: + node = graph.get_node(name) + else: + graph.add_node(op, name, upnode) + node = graph.get_node(name) + node_data = data_dict.get(name, {}) + # 添加输入输出数据 + input_data, output_data = get_input_output(node_data, node.id) + # 更新数据 + node.set_input_output(input_data, output_data) + # 添加节点 + node.add_upnode(upnode) + return node \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py new file mode 100644 index 00000000000..b21963a5976 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +from ...compare.acc_compare import read_op, merge_tensor, get_accuracy, _do_multi_process +from ....core.common.utils import task_dumppath_get +from ..utils import GraphConst + + +# 用于将节点名字解析成对应的NodeOp的规则 +op_patterns = [ + r'^(Module)', #NodeOp.module + r'^(Tensor|Torch|Functional|NPU|VF|Distributed|Aten)' #NodeOp.function_api +] + + +def get_compare_mode(dump_path_param): + """ + 获得比较模式,包括summary、MD5和真实数据三种模式 + Args: + dump_path_param: 调用acc_compare接口所依赖的参数 + Returns: 0 summary mode, 1 md5 mode, 2 true data mode + """ + summary_compare, md5_compare = task_dumppath_get(dump_path_param) + if summary_compare: + compare_mode = GraphConst.SUMMARY_COMPARE + elif md5_compare: + compare_mode = GraphConst.MD5_COMPARE + else: + compare_mode = GraphConst.REAL_DATA_COMPARE + return compare_mode + + +def run_real_data(dump_path_param, csv_path): + """ + 多进程运行生成真实数据 + Args: + dump_path_param: 调用acc_compare接口所依赖的参数 + csv_path: 生成文件路径 + """ + return _do_multi_process(dump_path_param, csv_path) + + +def get_input_output(node_data, node_id): + """ + 将dump的原始数据进行拆解,分解为output和input两个数据 + Args: + node_data: 属于单个节点的dump数据 + node_id: 节点名字 + """ + input_data = {} + output_data = {} + op_parsed_list = read_op(node_data, node_id) + for item in op_parsed_list: + full_op_name = item.get('full_op_name', '') + if not full_op_name: + continue + splits = full_op_name.split('.') + if len(splits) < GraphConst.OUTPUT_MIN_LEN: + continue + if GraphConst.OUTPUT in splits[GraphConst.OUTPUT_INDEX_TWO] and \ + GraphConst.INPUT not in splits[GraphConst.OUTPUT_INDEX_THREE]: + output_data[full_op_name] = item + else: + input_data[full_op_name] = item + return input_data, output_data + + +def compare_data(data_dict_list1, data_dict_list2): + """ + 比较get_input_output中输出的结果是否结构一致,比较一致返回True + """ + if len(data_dict_list1) != len(data_dict_list2): + return False + # 用于比较两个节点是否相等的关键字段 + tag_keys = ['type', 'dtype', 'shape'] + for key1, key2 in zip(data_dict_list1, data_dict_list2): + dict1 = 
data_dict_list1[key1] + dict2 = data_dict_list2[key2] + for tag_key in tag_keys: + tag_value1 = dict1.get(tag_key, None) + tag_value2 = dict2.get(tag_key, None) + if tag_value1 != tag_value2: + return False + return True + + +def format_node_data(data_dict): + """ + 批量进行节点数据的输出 + """ + del_list = ['requires_grad', 'data_name', 'full_op_name'] + for _, value in data_dict.items(): + if not isinstance(value, dict): + continue + for item in del_list: + if item in value: + del value[item] + _format_data(value) + return data_dict + + +def compare_node(node_ids, data_dicts, stack_json_data, is_summary_compare, is_md5_compare): + """ + 调用acc_compare.py中的get_accuracy获得精度对比指标 + 真实数据对比模式无法获得精度对比指标,需要调用多进程比对接口 + Returns: 包含参数信息和对比指标(真实数据对比模式除外)的list + """ + merge_n = _parse_node(node_ids[0], data_dicts[0], stack_json_data, is_summary_compare, is_md5_compare) + merge_b = _parse_node(node_ids[1], data_dicts[1], stack_json_data, is_summary_compare, is_md5_compare) + result = [] + get_accuracy(result, merge_n, merge_b, is_summary_compare, is_md5_compare) + return result + + +def _parse_node(node_id, data_dict, stack_json_data, is_summary_compare, is_md5_compare): + """ + 转换节点,使其能够作为acc_compare.py中的get_accuracy的入参 + """ + op_parsed_list = read_op(data_dict.get(node_id, {}), node_id) + if node_id in stack_json_data: + op_parsed_list.append( + {'full_op_name': node_id, 'full_info': stack_json_data[node_id]}) + else: + op_parsed_list.append({'full_op_name': node_id, 'full_info': None}) + result = merge_tensor(op_parsed_list, is_summary_compare, is_md5_compare) + if not result: + result['op_name'] = [] + return result + + +def _format_decimal_string(s): + """ + 使用正则表达式匹配包含数字、小数点和可选的百分号的字符串 + """ + pattern = re.compile(r'\d{1,20}\.\d{1,20}%?') + matches = pattern.findall(s) + for match in matches: + is_percent = match.endswith('%') + number_str = match.rstrip('%') + decimal_part = number_str.split('.')[1] + # 如果小数位数大于6,进行处理 + if len(decimal_part) > GraphConst.ROUND_TH: + number_float = 
float(number_str) + formatted_number = f"{number_float:.{GraphConst.ROUND_TH}f}" + # 如果原来是百分数,加回百分号 + if is_percent: + formatted_number += '%' + # 替换原字符串中的数值部分 + s = s.replace(match, formatted_number) + return s + + +def _format_data(data_dict): + """ + 格式化数据,小数保留6位,处理一些异常值 + """ + pattern = r'^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)$' + for key, value in data_dict.items(): + if isinstance(value, str): + # 将单引号删掉,None换成null避免前端解析错误 + value = value.replace("'", "").replace('None', 'null') + value = _format_decimal_string(value) + elif value is None or value == ' ': + value = 'null' + # 科学计数法1.123123123123e-11,格式化为1.123123e-11 + elif isinstance(value, float) and len(str(value)) < GraphConst.STR_MAX_LEN and re.match(pattern, str(value)): + value = "{:.6e}".format(value) + elif isinstance(value, float): + value = round(value, GraphConst.ROUND_TH) + # Inf会走入这里,确保转成Inf。另外给其他不符合预期的类型做兜底方案 + if not isinstance(value, (list, tuple, dict, str)): + value = str(value) + data_dict[key] = value diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py new file mode 100644 index 00000000000..9346f9e9bb9 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data +from ..utils import GraphConst, load_json_file, load_data_json_file, get_csv_df +from ..graph.graph import Graph +from .mode_adapter import ModeAdapter + + +class GraphComparator: + def __init__(self, graphs, data_paths, stack_path, output_path): + self.graph_n = graphs[0] + self.graph_b = graphs[1] + self._parse_param(data_paths, stack_path, output_path) + + def compare(self): + """ + 比较函数,初始化结束后单独调用。比较结果写入graph_n + """ + self._compare_nodes(self.graph_n.root) + self._postcompare() + + def add_compare_result_to_node(self, node, compare_result_list): + """ + 将比对结果添加到节点的输入输出数据中 + Args: + node: 节点 + compare_result_list: 包含参数信息和对比指标(真实数据对比模式除外)的list + """ + # 真实数据比对,先暂存节点,在多进程对比得到精度指标后,再将指标添加到节点中 + if self.ma.prepare_real_data(node): + return + compare_in_dict = {} + compare_out_dict = {} + # input和output对比数据分开 + for item in compare_result_list: + if 'output' in item[0]: + compare_out_dict[item[0]] = item + else: + compare_in_dict[item[0]] = item + precision_status, precision_index, other_dict = ( + self.ma.parse_result(node, [compare_in_dict, compare_out_dict])) + node.data[GraphConst.JSON_STATUS_KEY] = precision_status + node.data[GraphConst.JSON_INDEX_KEY] = precision_index + node.data.update(other_dict) + if not precision_status: + self.ma.add_error_key(node.output_data) + node.get_suggestions() + + def _parse_param(self, data_paths, stack_path, output_path): + self.dump_path_param = { + 'npu_json_path': data_paths[0], + 'bench_json_path': data_paths[1], + 
'stack_json_path': stack_path, + 'is_print_compare_log': True + } + self.output_path = output_path + compare_mode = get_compare_mode(self.dump_path_param) + self.ma = ModeAdapter(compare_mode) + self.data_n_dict = load_data_json_file(data_paths[0]) + self.data_b_dict = load_data_json_file(data_paths[1]) + self.stack_json_data = load_json_file(stack_path) + + def _postcompare(self): + if not self.ma.is_real_data_compare(): + return + df = get_csv_df(self.ma.is_md5_compare(), self.ma.is_summary_compare(), True, self.ma.csv_data) + df = run_real_data(self.dump_path_param, df) + compare_data_dict = {row[0]: row.tolist() for _, row in df.iterrows()} + for node in self.ma.compare_nodes: + precision_status, precision_index, _ = self.ma.parse_result(node, [compare_data_dict]) + node.data[GraphConst.JSON_STATUS_KEY] = precision_status + node.data[GraphConst.JSON_INDEX_KEY] = precision_index + if not precision_status: + self.ma.add_error_key(node.output_data) + node.get_suggestions() + + def _compare_nodes(self, node_n): + #递归遍历NPU树中的节点,如果在Bench中找到具有相同名称的节点,检查他们的祖先和参数信息,检查一致则及逆行精度数据对比 + #这里采用先序遍历,好处在于当这个节点被比较时,他的先序已经被匹配,这可以为后续的模糊匹配提供重要信息 + node_b, ancestors = Graph.match(self.graph_n, node_n, self.graph_b) + if node_b: + ancestors.append(node_b.id) + node_n.add_link(node_b, ancestors) + # 真实数据比对只会得到基本信息,并没有精度指标,需要调用多进程对比接口 + compare_result_list = compare_node([node_n.id, node_b.id], + [self.data_n_dict, self.data_b_dict], + self.stack_json_data, self.ma.is_summary_compare(), + self.ma.is_md5_compare()) + if compare_result_list: + self.ma.add_csv_data(compare_result_list) + self.add_compare_result_to_node(node_n, compare_result_list) + for subnode in node_n.subnodes: + self._compare_nodes(subnode) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py new file mode 100644 index 00000000000..d58f2078b6f --- /dev/null +++ 
b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py @@ -0,0 +1,211 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from ....core.common.const import CompareConst, Const +from ..utils import ToolTip, GraphConst, str2float + + +class ModeAdapter: + def __init__(self, compare_mode): + self.compare_mode = compare_mode + self.csv_data = [] + self.compare_nodes = [] + + @staticmethod + def _add_md5_compare_data(node_data, compare_data_dict): + precision_status = True + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + key_list = [GraphConst.JSON_MD5_KEY] + headers = CompareConst.MD5_COMPARE_RESULT_HEADER + id_list = [headers.index(x) for x in key_list] + ModeAdapter._match_data(value, compare_data, key_list, id_list) + # md5比对是否通过 + if value.get(GraphConst.JSON_MD5_KEY) != CompareConst.PASS: + precision_status = False + node_data[key] = value + return precision_status + + @staticmethod + def _add_real_compare_data(node_data, compare_data_dict): + min_thousandth = float(1) + numbers = [] + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + key_list = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, + CompareConst.ONE_THOUSANDTH_ERR_RATIO, 
CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] + headers = CompareConst.COMPARE_RESULT_HEADER + id_list = [headers.index(x) for x in key_list] + ModeAdapter._match_data(value, compare_data, key_list, id_list) + # 获取一个节点所有的输入或输出最小的双千指标 + thousandth = value.get(CompareConst.ONE_THOUSANDTH_ERR_RATIO) + # 可能是None,可能是非数字内容str + try: + thousandth = float(thousandth) + except (ValueError, TypeError): + thousandth = None + if thousandth is not None: + numbers.append(thousandth) + node_data[key] = value + # 双千指标都是None的异常情况 + if not numbers: + min_thousandth = None + else: + min_thousandth = min(numbers + [min_thousandth]) + return min_thousandth + + @staticmethod + def _add_summary_compare_data( node_data, compare_data_dict): + precision_status = True + max_relative_err = 0 + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + # 对应比对结果csv的列 + key_list = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF, + CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, + CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] + headers = CompareConst.SUMMARY_COMPARE_RESULT_HEADER + id_list = [headers.index(x) for x in key_list] + ModeAdapter._match_data(value, compare_data, key_list, id_list) + # 相对误差大于0.5疑似有精度问题,小值域1e-3不比较相对误差 + for index, item in enumerate(key_list[4:]): + value_diff = value.get(key_list[index]) + if isinstance(value_diff, float) and value_diff != 0 and abs(value_diff) < GraphConst.SMALL_VALUE: + value[item] = ToolTip.SMALL_VALUE_TIP.format(key_list[index]) + continue + relative_err = str2float(value.get(item)) + max_relative_err = max(max_relative_err, relative_err) + node_data[key] = value + if max_relative_err > GraphConst.MAX_RELATIVE_ERR_TH: + precision_status = False + max_relative_err = 1 if max_relative_err > 1 else max_relative_err + precision_index = 1 - max_relative_err + return precision_status, 
precision_index + + @staticmethod + def _match_data(data_dict, compare_data, key_list, id_list): + """ + 绑定精度指标到node的input_data和output_data + """ + if len(key_list) != len(id_list): + return + for id, key in zip(id_list, key_list): + data = compare_data[id] + if data is not None and 'nan' not in str(data) and str(data) != ' ': + data_dict[key] = data + else: + data_dict[key] = 'null' + + def parse_result(self, node, compare_data_dict): + """ + 根据结果返回数据,分别是precision_status,precision_index,和附加数据 + """ + other_dict = {} + if self.is_md5_compare(): + precision_status_in = ModeAdapter._add_md5_compare_data(node.input_data, compare_data_dict[0]) + precision_status_out = ModeAdapter._add_md5_compare_data(node.output_data, compare_data_dict[1]) + # 所有输入输出md5对比通过,这个节点才算通过 + precision_status = precision_status_in and precision_status_out + precision_index = 1 if precision_status else 0 + other_result = CompareConst.PASS if precision_status else CompareConst.DIFF + other_dict[GraphConst.JSON_MD5_KEY] = other_result + elif self.is_summary_compare(): + precision_status_in, precision_index_in = ModeAdapter._add_summary_compare_data(node.input_data, compare_data_dict[0]) + precision_status_out, precision_index_out = ModeAdapter._add_summary_compare_data(node.output_data, compare_data_dict[1]) + precision_status = precision_status_in and precision_status_out + precision_index = min(precision_index_in, precision_index_out) + else: + min_thousandth_in = ModeAdapter._add_real_compare_data(node.input_data, compare_data_dict[0]) + min_thousandth_out = ModeAdapter._add_real_compare_data(node.output_data, compare_data_dict[0]) + if min_thousandth_in and min_thousandth_out: + change_percentage = abs(min_thousandth_in - min_thousandth_out) + else: + change_percentage = 0 + precision_status = True + if change_percentage > GraphConst.REAL_DATA_TH: + precision_status = False + precision_index = 0 if change_percentage > 1 else 1 - change_percentage + return precision_status, precision_index, 
other_dict + + def prepare_real_data(self, node): + """ + 为真实数据比较模式准备节点信息 + """ + if self.is_real_data_compare(): + self.compare_nodes.append(node) + return True + return False + + def is_summary_compare(self): + return self.compare_mode == GraphConst.SUMMARY_COMPARE + + def is_md5_compare(self): + return self.compare_mode == GraphConst.MD5_COMPARE + + def is_real_data_compare(self): + return self.compare_mode == GraphConst.REAL_DATA_COMPARE + + def add_csv_data(self, compare_result_list): + if not self.is_real_data_compare(): + return + self.csv_data.extend(compare_result_list) + + def add_error_key(self, node_data): + """ + 根据不同的模式进行提供不同错误信息 + """ + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + if self.is_summary_compare(): + message = [CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, + CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] + elif self.is_real_data_compare(): + message = [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] + else: + # 输出件优化 + message = [] + value[GraphConst.ERROR_KEY] = message + node_data[key] = value + + def get_tool_tip(self): + """ + 用于前端展示字段的具体含义 + """ + if self.is_summary_compare(): + tips = { + CompareConst.MAX_DIFF: ToolTip.MAX_DIFF, + CompareConst.MIN_DIFF: ToolTip.MIN_DIFF, + CompareConst.MEAN_DIFF: ToolTip.MEAN_DIFF, + CompareConst.NORM_DIFF: ToolTip.NORM_DIFF} + elif self.is_md5_compare(): + tips = {Const.MD5: ToolTip.MD5} + else: + tips = { + CompareConst.ONE_THOUSANDTH_ERR_RATIO: ToolTip.ONE_THOUSANDTH_ERR_RATIO, + CompareConst.COSINE: ToolTip.COSINE, + CompareConst.MAX_ABS_ERR: ToolTip.MAX_ABS_ERR, + CompareConst.MAX_RELATIVE_ERR: ToolTip.MAX_RELATIVE_ERR} + return tips diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/__init__.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py new file mode 100644 index 00000000000..f04f367f591 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py @@ -0,0 +1,107 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .node_op import NodeOp +from ..utils import Suggestions, GraphConst +from ..builder.msprobe_adapter import format_node_data, compare_data + + +class BaseNode: + def __init__(self, node_op, node_id, up_node=None): + self.op = node_op + self.id = node_id + self.data = {} + self.output_data = {} + self.input_data = {} + self.upnode = None + self.add_upnode(up_node) + self.subnodes = [] + self.matched_node_link = [] + self.suggestions = {} + + def __str__(self): + info = f'id:\t{self.id}' + return info + + def __eq__(self, other): + """ + 用来判断两个节点是否可以被匹配上,认为结构上是否一致 + """ + if not compare_data(self.input_data, other.input_data): + return False + if not compare_data(self.output_data, other.output_data): + return False + return True + + def get_suggestions(self): + """ + 精度疑似有问题时,提供一些建议 + """ + if self.op == NodeOp.module: + self.suggestions[GraphConst.SUGGEST_KEY] = Suggestions.Module + self.suggestions[Suggestions.PTDBG] = Suggestions.PTDBG_URL + elif self.op == NodeOp.function_api: + self.suggestions[GraphConst.SUGGEST_KEY] = Suggestions.API + 
self.suggestions[Suggestions.API_ACCURACY_CHECKER] = Suggestions.API_ACCURACY_CHECKER_URL + + def set_input_output(self, input_data, output_data): + self.input_data = input_data + self.output_data = output_data + + def add_upnode(self, node): + """ + 绑定upnode,用于对两个节点进行上下级关联 + """ + if not node or node.id == self.id or self.upnode: + return + self.upnode = node + node.subnodes.append(self) + + def add_link(self, node, ancestors): + """ + 在节点匹配成功后进行匹配数据的录入 + Args: + node: 和self相互匹配的节点 + ancestors: 对面节点的祖先信息 + """ + self.matched_node_link = ancestors + node.matched_node_link = ancestors + + def to_dict(self): + """ + 输出数据 + """ + result = {} + result['id'] = self.id + result['node_type'] = self.op.value + result['data'] = self.data + result['output_data'] = format_node_data(self.output_data) + result['input_data'] = format_node_data(self.input_data) + result['upnode'] = self.upnode.id if self.upnode else 'None' + result['subnodes'] = [node.id for node in self.subnodes] + result['matched_node_link'] = self.matched_node_link + result['suggestions'] = self.suggestions + return result + + def get_ancestors(self): + """ + 获取节点所有祖先的列表 + """ + ancestors = [] + current_node = self.upnode + while current_node: + ancestors.append(current_node.id) + current_node = current_node.upnode + return list(reversed(ancestors)) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py new file mode 100644 index 00000000000..6bae10ad3fc --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py @@ -0,0 +1,86 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base_node import BaseNode +from .node_op import NodeOp +from ..utils import GraphConst + + +class Graph: + def __init__(self, model_name): + self.node_map = {} + self.add_node(NodeOp.module, model_name) + self.root = self.get_node(model_name) + + def __str__(self): + infos = [f'{str(self.node_map.get(node_id))}' for node_id in self.node_map] + info = "\n".join(infos) + return info + + @staticmethod + def match(graph_n, node_n, graph_b): + """ + 给定节点n,在另一个graph中匹配它对应的节点。前置条件是它的父节点匹配已经完成 + 目前采用完全匹配的方式,后续可能在这里加入一定的模糊匹配逻辑 + 返回匹配结果,匹配到的节点,以及祖先树。没匹配到则返回None, [] + """ + if not node_n or node_n.id not in graph_b.node_map: + return None, [] + node_b = graph_b.node_map.get(node_n.id) + if node_n != node_b: + return None, [] + ancestors_n = node_n.get_ancestors() + ancestors_b = node_b.get_ancestors() + if ancestors_n != ancestors_b: + return None, [] + return node_b, ancestors_n + + @staticmethod + def dfs(node, result): + info = node.to_dict() + result[node.id] = info + for subnode in node.subnodes: + Graph.dfs(subnode, result) + + def add_node(self, node_op, node_id, up_node=None): + """ + 在graph中进行节点的添加 + Args: + node_op: 需要添加的节点类型 + node_id: 需要添加的节点id + up_node:对应节点的父节点 + """ + if node_id in self.node_map: + return + node = BaseNode(node_op, node_id, up_node) + self.node_map[node_id] = node + + def get_node(self, node_id): + """ + 返回节点,不存在返回None + """ + return self.node_map.get(node_id, None) + + def to_dict(self): + """ + 用于数据输出 + """ + result = {} + result[GraphConst.JSON_ROOT_KEY] = self.root.id if self.root else 'None' + 
result[GraphConst.JSON_NODE_KEY] = {} + for node_id in self.node_map: + info = self.node_map.get(node_id).to_dict() + result[GraphConst.JSON_NODE_KEY][node_id] = info + return result diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py new file mode 100644 index 00000000000..1629caabd19 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum +import re +from ..builder.msprobe_adapter import op_patterns + + +class NodeOp(Enum): + module = 0 + function_api = 1 + + @staticmethod + def get_node_op(node_name: str): + """ + 基于代表节点的字符串,解析节点种类 + """ + for op in NodeOp: + index = op.value + if index < 0 or index >= len(op_patterns): + raise Exception("NodeOp and op_patterns in MsprobeAdapter do not match") + pattern = op_patterns[index] + if re.match(pattern, node_name): + return op + raise Exception(f"Cannot parse node_name {node_name} into NodeOp") diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/test.py b/debug/accuracy_tools/msprobe/pytorch/visualization/test.py new file mode 100644 index 00000000000..165d54ce17e --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/test.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. 
+# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import shutil +import filecmp +from .compare.graph_comparator import GraphComparator +from .utils import GraphConst +from .builder.graph_builder import GraphBuilder +from ...pytorch.common.log import logger +from ...core.common.file_check import create_directory + + +def compare_graph(dump_path_n, dump_path_b, out_path): + # 对两个数据进行构图 + construct_path_n = os.path.join(dump_path_n, GraphConst.CONSTRUCT_FILE) + construct_path_b = os.path.join(dump_path_b, GraphConst.CONSTRUCT_FILE) + data_path_n = os.path.join(dump_path_n, GraphConst.DUMP_FILE) + data_path_b = os.path.join(dump_path_b, GraphConst.DUMP_FILE) + graph_n = GraphBuilder.build(construct_path_n, data_path_n, 'TestNet') + graph_b = GraphBuilder.build(construct_path_b, data_path_b, 'TestNet') + # 基于graph、stack和data进行比较 + stack_path = os.path.join(dump_path_n, GraphConst.STACK_FILE) + graph_comparator = GraphComparator([graph_n, graph_b], [data_path_n, data_path_b], stack_path, out_path) + graph_comparator.compare() + output_path = os.path.join(out_path, 'compare.vis') + GraphBuilder.to_json(output_path, graph_n, graph_b, graph_comparator.ma.get_tool_tip()) + + +def build_graph(dump_path, out_path): + construct_path = os.path.join(dump_path, GraphConst.CONSTRUCT_FILE) + data_path = os.path.join(dump_path, GraphConst.DUMP_FILE) + output_path = os.path.join(out_path, 'build.vis') + graph = 
GraphBuilder.build(construct_path, data_path, 'TestNet') + GraphBuilder.to_json(output_path, graph) + + +def run_st(data_path): + start_time = time.time() + run_bench(data_path, 'output2') + end_time = time.time() + logger.info(f'run_st time cost: {end_time - start_time}') + # 比较output2的结果和output1 的bench结果差距 + for data_dir in os.listdir(data_path): + data_dir = os.path.join(data_path, data_dir) + if not os.path.isdir(data_dir): + continue + output1 = os.path.join(data_dir, 'output1') + output2 = os.path.join(data_dir, 'output2') + files = ['build.vis', 'compare.vis'] + for vis_file in files: + file1 = os.path.join(output1, vis_file) + file2 = os.path.join(output2, vis_file) + result = filecmp.cmp(file1, file2) + if result: + logger.info('pass ' + file1) + else: + logger.info('not pass ' + file1) + + +def run_bench(data_path, output_dir): + for data_dir in os.listdir(data_path): + data_dir = os.path.join(data_path, data_dir) + if not os.path.isdir(data_dir): + continue + run_data_path = os.path.join(data_dir, 'data') + output_path = os.path.join(data_dir, output_dir) + if os.path.exists(output_path): + shutil.rmtree(output_path) + create_directory(output_path) + build_graph(run_data_path, output_path) + compare_graph(run_data_path, run_data_path, output_path) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py new file mode 100644 index 00000000000..e517377bc58 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from ...core.common.file_check import FileOpen +from ..compare.acc_compare import result_to_csv + + +def load_json_file(file_path): + """ + 加载json文件 + """ + try: + with FileOpen(file_path, 'r') as f: + file_dict = json.load(f) + if not isinstance(file_dict, dict): + return {} + return file_dict + except json.JSONDecodeError: + return {} + + +def load_data_json_file(file_path): + """ + 加载dump.json中的data字段 + """ + return load_json_file(file_path).get(GraphConst.DATA_KEY, {}) + + +def save_json_file(file_path, data): + """ + 保存json文件 + """ + with FileOpen(file_path, 'w') as f: + f.write(json.dumps(data, indent=4)) + + +def get_csv_df(md5_compare, summary_compare, stack, csv_data): + """ + 调用acc接口写入csv + """ + return result_to_csv(md5_compare, summary_compare, stack, csv_data) + + +def str2float(percentage_str): + """ + 百分比字符串转换转换为浮点型 + Args: + percentage_str: '0.00%', '23.4%' + Returns: float 0.00, 0.234 + """ + try: + percentage_str = percentage_str.strip('%') + return float(percentage_str) / 100 + except ValueError: + return 0 + + +class ToolTip: + MAX_DIFF = 'NPU与标杆API统计信息比对,最大值的差值' + MIN_DIFF = 'NPU与标杆API统计信息比对,最小值的差值' + MEAN_DIFF = 'NPU与标杆API统计信息比对,平均值的差值' + NORM_DIFF = 'NPU与标杆API统计信息比对,2范数(平方根)的差值' + MD5 = '数据MD5信息,用于比较两个数据信息是否完全一致' + ONE_THOUSANDTH_ERR_RATIO = 'Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一的比例占总元素个数的比例小于千分之一' + COSINE = '通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0' + MAX_ABS_ERR = '当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001' + MAX_RELATIVE_ERR = 
'当最大相对误差越接近0表示其计算的误差越小。当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象' + SMALL_VALUE_TIP = '{} 小于1e-3,不计算相对误差' + + +class Suggestions: + Module = '此模块精度比对结果疑似异常,请使用ptdbg工具对模块中的api进行dump比对' + API = '此api精度比对结果疑似异常,请使用api accuracy checker工具对api进行精度检测' + PTDBG = 'ptdbg工具' + PTDBG_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend' + API_ACCURACY_CHECKER = 'api accuracy checker工具' + API_ACCURACY_CHECKER_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker' + + +class GraphConst: + CONSTRUCT_FILE = 'construct.json' + DUMP_FILE = 'dump.json' + STACK_FILE = 'stack.json' + GRAPH_FILE = 'graph.vis' + ERROR_KEY = 'error_key' + SUMMARY_COMPARE = 0 + MD5_COMPARE = 1 + REAL_DATA_COMPARE = 2 + JSON_NPU_KEY = 'NPU' + JSON_BENCH_KEY = 'Bench' + JSON_TIP_KEY = 'Tooltip' + JSON_MD5_KEY = 'md5 Compare Result' + JSON_ROOT_KEY = 'root' + JSON_NODE_KEY = 'node' + DATA_KEY = 'data' + REAL_DATA_TH = 0.1 + MAX_RELATIVE_ERR_TH = 0.5 + ROUND_TH = 6 + JSON_STATUS_KEY = 'precision_status' + JSON_INDEX_KEY = 'precision_index' + SUGGEST_KEY = 'text' + TAG_NA = 'na' + OUTPUT_INDEX_TWO = -2 + OUTPUT_INDEX_THREE = -3 + OUTPUT_MIN_LEN = 3 + INPUT = 'input' + OUTPUT = 'output' + STR_MAX_LEN = 50 + SMALL_VALUE = 1e-3 -- Gitee From 0c0c60b725f39eaa55905dfee42ab7381551eb67 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Tue, 6 Aug 2024 20:46:52 +0800 Subject: [PATCH 31/94] =?UTF-8?q?=E5=B0=86=E5=88=86=E7=BA=A7=E5=8F=AF?= =?UTF-8?q?=E8=A7=86=E5=8C=96=E4=BB=A3=E7=A0=81=E6=90=AC=E8=87=B3poc?= =?UTF-8?q?=E5=88=86=E6=94=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pytorch/visualization/graph/base_node.py | 2 +- .../msprobe/pytorch/visualization/test.py | 52 ++----------------- .../msprobe/pytorch/visualization/utils.py | 12 ++--- 3 files changed, 12 insertions(+), 54 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py 
b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py index f04f367f591..8032ffecede 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py @@ -51,7 +51,7 @@ class BaseNode: """ if self.op == NodeOp.module: self.suggestions[GraphConst.SUGGEST_KEY] = Suggestions.Module - self.suggestions[Suggestions.PTDBG] = Suggestions.PTDBG_URL + self.suggestions[Suggestions.DUMP] = Suggestions.DUMP_URL elif self.op == NodeOp.function_api: self.suggestions[GraphConst.SUGGEST_KEY] = Suggestions.API self.suggestions[Suggestions.API_ACCURACY_CHECKER] = Suggestions.API_ACCURACY_CHECKER_URL diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/test.py b/debug/accuracy_tools/msprobe/pytorch/visualization/test.py index 165d54ce17e..6f78fe49679 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/test.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/test.py @@ -14,24 +14,19 @@ # limitations under the License. 
import os -import time -import shutil -import filecmp from .compare.graph_comparator import GraphComparator from .utils import GraphConst from .builder.graph_builder import GraphBuilder -from ...pytorch.common.log import logger -from ...core.common.file_check import create_directory -def compare_graph(dump_path_n, dump_path_b, out_path): +def compare_graph(dump_path_n, dump_path_b, out_path, model_name='TestNet'): # 对两个数据进行构图 construct_path_n = os.path.join(dump_path_n, GraphConst.CONSTRUCT_FILE) construct_path_b = os.path.join(dump_path_b, GraphConst.CONSTRUCT_FILE) data_path_n = os.path.join(dump_path_n, GraphConst.DUMP_FILE) data_path_b = os.path.join(dump_path_b, GraphConst.DUMP_FILE) - graph_n = GraphBuilder.build(construct_path_n, data_path_n, 'TestNet') - graph_b = GraphBuilder.build(construct_path_b, data_path_b, 'TestNet') + graph_n = GraphBuilder.build(construct_path_n, data_path_n, model_name) + graph_b = GraphBuilder.build(construct_path_b, data_path_b, model_name) # 基于graph、stack和data进行比较 stack_path = os.path.join(dump_path_n, GraphConst.STACK_FILE) graph_comparator = GraphComparator([graph_n, graph_b], [data_path_n, data_path_b], stack_path, out_path) @@ -40,46 +35,9 @@ def compare_graph(dump_path_n, dump_path_b, out_path): GraphBuilder.to_json(output_path, graph_n, graph_b, graph_comparator.ma.get_tool_tip()) -def build_graph(dump_path, out_path): +def build_graph(dump_path, out_path, model_name='TestNet'): construct_path = os.path.join(dump_path, GraphConst.CONSTRUCT_FILE) data_path = os.path.join(dump_path, GraphConst.DUMP_FILE) output_path = os.path.join(out_path, 'build.vis') - graph = GraphBuilder.build(construct_path, data_path, 'TestNet') + graph = GraphBuilder.build(construct_path, data_path, model_name) GraphBuilder.to_json(output_path, graph) - - -def run_st(data_path): - start_time = time.time() - run_bench(data_path, 'output2') - end_time = time.time() - logger.info(f'run_st time cost: {end_time - start_time}') - # 比较output2的结果和output1 
的bench结果差距 - for data_dir in os.listdir(data_path): - data_dir = os.path.join(data_path, data_dir) - if not os.path.isdir(data_dir): - continue - output1 = os.path.join(data_dir, 'output1') - output2 = os.path.join(data_dir, 'output2') - files = ['build.vis', 'compare.vis'] - for vis_file in files: - file1 = os.path.join(output1, vis_file) - file2 = os.path.join(output2, vis_file) - result = filecmp.cmp(file1, file2) - if result: - logger.info('pass ' + file1) - else: - logger.info('not pass ' + file1) - - -def run_bench(data_path, output_dir): - for data_dir in os.listdir(data_path): - data_dir = os.path.join(data_path, data_dir) - if not os.path.isdir(data_dir): - continue - run_data_path = os.path.join(data_dir, 'data') - output_path = os.path.join(data_dir, output_dir) - if os.path.exists(output_path): - shutil.rmtree(output_path) - create_directory(output_path) - build_graph(run_data_path, output_path) - compare_graph(run_data_path, run_data_path, output_path) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py index e517377bc58..15def154edb 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py @@ -82,12 +82,12 @@ class ToolTip: class Suggestions: - Module = '此模块精度比对结果疑似异常,请使用ptdbg工具对模块中的api进行dump比对' - API = '此api精度比对结果疑似异常,请使用api accuracy checker工具对api进行精度检测' - PTDBG = 'ptdbg工具' - PTDBG_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend' - API_ACCURACY_CHECKER = 'api accuracy checker工具' - API_ACCURACY_CHECKER_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker' + Module = '此模块精度比对结果疑似异常,请使用msprobe工具的数据采集功能对模块中的api进行dump比对' + API = '此api精度比对结果疑似异常,请使用msprobe工具的预检功能对api进行精度检测' + DUMP = 'msprobe工具的数据采集功能' + DUMP_URL = 'https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/pytorch/doc/dump.md' + 
API_ACCURACY_CHECKER = 'msprobe工具的预检功能' + API_ACCURACY_CHECKER_URL = 'https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/pytorch/doc/api_accuracy_checker.md' class GraphConst: -- Gitee From cc8a09aabeebac53363dc207d7f57eac8e0a5497 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Tue, 6 Aug 2024 20:51:02 +0800 Subject: [PATCH 32/94] =?UTF-8?q?rename=E6=9E=84=E5=9B=BE=E5=85=A5?= =?UTF-8?q?=E5=8F=A3=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/pytorch/visualization/{test.py => graph_service.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename debug/accuracy_tools/msprobe/pytorch/visualization/{test.py => graph_service.py} (100%) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/test.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph_service.py similarity index 100% rename from debug/accuracy_tools/msprobe/pytorch/visualization/test.py rename to debug/accuracy_tools/msprobe/pytorch/visualization/graph_service.py -- Gitee From d69e8ac9cd5e38770c0de5474a63e129bca5efb2 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 7 Aug 2024 01:21:15 +0000 Subject: [PATCH 33/94] fix bugg in csv header --- debug/accuracy_tools/kj600/kj600/module_metric.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 8145e5d68d4..456776884af 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -171,7 +171,7 @@ def write_metrics_tensorboard(ops, summary_writer, metric_value, step, prefix='' except KeyError as e: raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e -def write_metrics_csv(ops, summary_writer, metric_value, step, preifx=''): +def write_metrics_csv(ops, summary_writer, 
metric_value, step, prefix=''): for metric_name in ops: try: fun_metric = config_metric_registry[metric_name] @@ -182,11 +182,9 @@ def write_metrics_csv(ops, summary_writer, metric_value, step, preifx=''): raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e if not summary_writer.header: - keys = metric_value[0][metric_name].keys() - if len(keys) > 1: - summary_writer.header = ['param_name'] - for key in keys: - summary_writer.header.extend([f'{key.split("/")[-1]}_{op}' for op in ops]) + if prefix in ['actv', 'grad_actv']: + summary_writer.header = ['param_name'] + ['input_'+op for op in ops] + ['output_'+op for op in ops] else: summary_writer.header = ['param_name'] + ops - summary_writer.write_csv(preifx, step) \ No newline at end of file + summary_writer.write_csv(prefix, step) + summary_writer.header = [] \ No newline at end of file -- Gitee From f7ec77908e57d08a0751306ef756e6cfcb526664 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 7 Aug 2024 01:21:29 +0000 Subject: [PATCH 34/94] less verbose --- debug/accuracy_tools/kj600/kj600/module_hook.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 3c1b5492cc7..6935312aeb8 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -192,7 +192,10 @@ class TrainerMon: raise Exception("ur_distribution cannot be enabled with unknown optimizer.") if self.mv_distribution: raise Exception("mv_distribution cannot be enabled with unknown optimizer.") + self.verbose = False self.print_struct = self.config.get("print_struct", False) + if self.print_struct: + self.verbose = True self.struct_printed = False self.module_struct = {} return @@ -262,7 +265,7 @@ class TrainerMon: self._register_param_name(model) self._hook_model_for_grad_acc(model) - # 
self._hook_weights() + # self._hook_weighfts() self.hook_modules(model[0], grad_acc_steps) @@ -315,7 +318,8 @@ class TrainerMon: # print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") # self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') # bwd_context.actvgrad.clear() - + if not len(self.grad_context.actv) == self.micro_batch_number: + print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") self.write_metrics(self.ops, self.summary_writer, self.grad_context.actv, step, 'grad_actv') self.grad_context.actv.clear() @@ -425,6 +429,8 @@ class TrainerMon: return def _smallest_rank_print(self, msg): + if not self.verbose: + return if dist.is_initialized(): if self.module_rank_list: if dist.get_rank() == min(self.module_rank_list): -- Gitee From 68be3da799c5bb6c8dee296c4d89c734293b4ef5 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 7 Aug 2024 02:47:06 +0000 Subject: [PATCH 35/94] support vpp in module hook --- .../accuracy_tools/kj600/kj600/module_hook.py | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 6935312aeb8..d32a6f8cb15 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -220,17 +220,18 @@ class TrainerMon: def hook_modules(self, model:torch.nn.Module, grad_acc_steps): # fwd=0, bkd=1 # targets is module name list like ["xx.xxx1", "xxx.xxx2"] which can be obtained when first run. 
- print_rank_0("> module names:") - for name, _ in model.named_modules(): - print_rank_0(f"\t{name}") - self.micro_batch_number = grad_acc_steps + if (dist.is_initialized() and dist.get_rank() not in self.module_rank_list): + return + + if not isinstance(model, list): + model = [model] - if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): - targets = [x for x, _ in model.named_modules()] if self.print_struct else self.config['targets'].keys() - hooked_count = self._hook_module(targets, model, fwd_or_bkd=0) + self.micro_batch_number = grad_acc_steps + for vpp_stage, model_chunk in enumerate(model): + vpp_stage = f'{vpp_stage}_' if self.vpp else '' + targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config['targets'].keys() + hooked_count = self._hook_module(targets, model_chunk, vpp_stage) print_rank_0(f"> {hooked_count} out of {len(self.config['targets'])} are monitored.") - else: - return if not self.optimizer_hooked: print_rank_0("> parameter names:") @@ -265,8 +266,8 @@ class TrainerMon: self._register_param_name(model) self._hook_model_for_grad_acc(model) - # self._hook_weighfts() - self.hook_modules(model[0], grad_acc_steps) + # self._hook_weights() + self.hook_modules(model, grad_acc_steps) def build_tbtag_tensor_map(self, module_name, tag, tensor): @@ -319,7 +320,7 @@ class TrainerMon: # self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') # bwd_context.actvgrad.clear() if not len(self.grad_context.actv) == self.micro_batch_number: - print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") + print_warn_log(f"grad_context.actv not equal to micro_batch_number: {len(self.grad_context.actv)}, {self.micro_batch_number}") self.write_metrics(self.ops, self.summary_writer, self.grad_context.actv, step, 'grad_actv') self.grad_context.actv.clear() @@ -463,7 +464,7 @@ class 
TrainerMon: self.param_registered = True - def _hook_module(self, target_names, module: torch.nn.Module, fwd_or_bkd): + def _hook_module(self, target_names, module: torch.nn.Module, vpp_stage=''): if '_modules' not in module.__dict__: # nothing to hook return 0 @@ -550,10 +551,14 @@ class TrainerMon: context.step += 1 return + if self.backward_only and self.forward_only: + print_warn_log('not enable backward_only and forward_only simultaneously') + hooked_count = 0 - for name, submodule in module.named_modules(): + for module_name, submodule in module.named_modules(): + name = vpp_stage + module_name self.module_struct[name] = {} - if name in target_names: + if name in target_names or module_name in target_names: if not self.backward_only: submodule.register_forward_hook(fwd_hook_fun) self.module_fwd_hook_context_by_module[submodule] = ModuleHookContext(name) -- Gitee From 12d7fd6d444335ba567b868f2b93b49f62d0fbe1 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Wed, 7 Aug 2024 14:13:09 +0800 Subject: [PATCH 36/94] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pytorch/visualization/builder/graph_builder.py | 8 ++++---- .../pytorch/visualization/builder/msprobe_adapter.py | 6 +++--- .../pytorch/visualization/compare/graph_comparator.py | 8 ++++---- .../msprobe/pytorch/visualization/compare/mode_adapter.py | 6 +++--- .../msprobe/pytorch/visualization/graph/base_node.py | 6 +++--- .../msprobe/pytorch/visualization/graph/graph.py | 6 +++--- .../msprobe/pytorch/visualization/graph/node_op.py | 2 +- .../msprobe/pytorch/visualization/graph_service.py | 6 +++--- .../accuracy_tools/msprobe/pytorch/visualization/utils.py | 1 - 9 files changed, 24 insertions(+), 25 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py index 
f623a48ae3b..6366b752043 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/graph_builder.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..graph.graph import Graph -from ..graph.node_op import NodeOp -from ..utils import load_json_file, load_data_json_file, save_json_file, GraphConst -from .msprobe_adapter import get_input_output +from msprobe.pytorch.visualization.graph.graph import Graph +from msprobe.pytorch.visualization.graph.node_op import NodeOp +from msprobe.pytorch.visualization.utils import load_json_file, load_data_json_file, save_json_file, GraphConst +from msprobe.pytorch.visualization.builder.msprobe_adapter import get_input_output class GraphBuilder: diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py index b21963a5976..fa0cc5de0a4 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/builder/msprobe_adapter.py @@ -14,9 +14,9 @@ # limitations under the License. 
import re -from ...compare.acc_compare import read_op, merge_tensor, get_accuracy, _do_multi_process -from ....core.common.utils import task_dumppath_get -from ..utils import GraphConst +from msprobe.pytorch.compare.acc_compare import read_op, merge_tensor, get_accuracy, _do_multi_process +from msprobe.core.common.utils import task_dumppath_get +from msprobe.pytorch.visualization.utils import GraphConst # 用于将节点名字解析成对应的NodeOp的规则 diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py index 9346f9e9bb9..2465cf1ae0d 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/graph_comparator.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data -from ..utils import GraphConst, load_json_file, load_data_json_file, get_csv_df -from ..graph.graph import Graph -from .mode_adapter import ModeAdapter +from msprobe.pytorch.visualization.builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data +from msprobe.pytorch.visualization.utils import GraphConst, load_json_file, load_data_json_file, get_csv_df +from msprobe.pytorch.visualization.graph.graph import Graph +from msprobe.pytorch.visualization.compare.mode_adapter import ModeAdapter class GraphComparator: diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py index d58f2078b6f..f5d74d27589 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py @@ -32,12 +32,12 @@ class ModeAdapter: continue compare_data = 
compare_data_dict.get(key) if compare_data: - key_list = [GraphConst.JSON_MD5_KEY] + key_list = [CompareConst.RESULT] headers = CompareConst.MD5_COMPARE_RESULT_HEADER id_list = [headers.index(x) for x in key_list] ModeAdapter._match_data(value, compare_data, key_list, id_list) # md5比对是否通过 - if value.get(GraphConst.JSON_MD5_KEY) != CompareConst.PASS: + if value.get(CompareConst.RESULT) != CompareConst.PASS: precision_status = False node_data[key] = value return precision_status @@ -130,7 +130,7 @@ class ModeAdapter: precision_status = precision_status_in and precision_status_out precision_index = 1 if precision_status else 0 other_result = CompareConst.PASS if precision_status else CompareConst.DIFF - other_dict[GraphConst.JSON_MD5_KEY] = other_result + other_dict[CompareConst.RESULT] = other_result elif self.is_summary_compare(): precision_status_in, precision_index_in = ModeAdapter._add_summary_compare_data(node.input_data, compare_data_dict[0]) precision_status_out, precision_index_out = ModeAdapter._add_summary_compare_data(node.output_data, compare_data_dict[1]) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py index 8032ffecede..e8c86e243e0 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/base_node.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .node_op import NodeOp -from ..utils import Suggestions, GraphConst -from ..builder.msprobe_adapter import format_node_data, compare_data +from msprobe.pytorch.visualization.graph.node_op import NodeOp +from msprobe.pytorch.visualization.utils import Suggestions, GraphConst +from msprobe.pytorch.visualization.builder.msprobe_adapter import format_node_data, compare_data class BaseNode: diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py index 6bae10ad3fc..57191d5e64e 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/graph.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .base_node import BaseNode -from .node_op import NodeOp -from ..utils import GraphConst +from msprobe.pytorch.visualization.graph.base_node import BaseNode +from msprobe.pytorch.visualization.graph.node_op import NodeOp +from msprobe.pytorch.visualization.utils import GraphConst class Graph: diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py index 1629caabd19..441f6e40fb7 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/graph/node_op.py @@ -15,7 +15,7 @@ from enum import Enum import re -from ..builder.msprobe_adapter import op_patterns +from msprobe.pytorch.visualization.builder.msprobe_adapter import op_patterns class NodeOp(Enum): diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/graph_service.py b/debug/accuracy_tools/msprobe/pytorch/visualization/graph_service.py index 6f78fe49679..376367ab06e 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/graph_service.py +++ 
b/debug/accuracy_tools/msprobe/pytorch/visualization/graph_service.py @@ -14,9 +14,9 @@ # limitations under the License. import os -from .compare.graph_comparator import GraphComparator -from .utils import GraphConst -from .builder.graph_builder import GraphBuilder +from msprobe.pytorch.visualization.compare.graph_comparator import GraphComparator +from msprobe.pytorch.visualization.utils import GraphConst +from msprobe.pytorch.visualization.builder.graph_builder import GraphBuilder def compare_graph(dump_path_n, dump_path_b, out_path, model_name='TestNet'): diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py index 15def154edb..21f4152abe7 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py @@ -102,7 +102,6 @@ class GraphConst: JSON_NPU_KEY = 'NPU' JSON_BENCH_KEY = 'Bench' JSON_TIP_KEY = 'Tooltip' - JSON_MD5_KEY = 'md5 Compare Result' JSON_ROOT_KEY = 'root' JSON_NODE_KEY = 'node' DATA_KEY = 'data' -- Gitee From 83cec2451342cc10fec98c822853d54f3873e3f7 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Wed, 7 Aug 2024 14:16:59 +0800 Subject: [PATCH 37/94] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/pytorch/visualization/compare/mode_adapter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py index f5d74d27589..bd272271380 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under 
the License. -import json -from ....core.common.const import CompareConst, Const -from ..utils import ToolTip, GraphConst, str2float +from msprobe.core.common.const import CompareConst, Const +from msprobe.pytorch.visualization.utils import ToolTip, GraphConst, str2float class ModeAdapter: -- Gitee From d7f28245cca9dfb46b671208e7353dc89ee59e0e Mon Sep 17 00:00:00 2001 From: l30044004 Date: Wed, 7 Aug 2024 14:28:23 +0800 Subject: [PATCH 38/94] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/msprobe/pytorch/visualization/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py index 21f4152abe7..546b1bfea62 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py @@ -14,8 +14,8 @@ # limitations under the License. 
import json -from ...core.common.file_check import FileOpen -from ..compare.acc_compare import result_to_csv +from msprobe.core.common.file_check import FileOpen +from msprobe.pytorch.compare.acc_compare import result_to_csv def load_json_file(file_path): -- Gitee From 01bee89a39e252c405a00c1b70e8aeb064069a6b Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 7 Aug 2024 07:22:43 +0000 Subject: [PATCH 39/94] add base writer class --- .../kj600/kj600/anomaly_detect.py | 60 ++++++++----------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 5a98aabb863..5eb28fc79f7 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -63,7 +63,28 @@ class bcolors: UNDERLINE = '\033[4m' -class CSVWriterWithAD: +class BaseWriterWithAD: + def __init__(self): + pass + + def _add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): + new_avg = avg = scalar_value + if tag in self.tag2scalars: + N = len(self.tag2scalars[tag]) + _, avg = self.tag2scalars[tag][-1] + new_avg = (avg*N + scalar_value)/(N + 1) + self.tag2scalars[tag].append((scalar_value, new_avg)) + detected, rule_name = self._ad(scalar_value, history=avg) + if detected: + print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") + exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" + if self.anomaly_inform: + self.anomaly_inform.run(exception_message, self.job_id) + + def _ad(self, scalar_value, history): + return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) + +class CSVWriterWithAD(BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False): self.path = path create_directory(path) @@ -89,28 +110,12 @@ 
class CSVWriterWithAD: self.context_dict = defaultdict(list) def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): - new_avg = avg = scalar_value - if tag in self.tag2scalars: - N = len(self.tag2scalars[tag]) - _, avg = self.tag2scalars[tag][-1] - new_avg = (avg*N + scalar_value)/(N + 1) - self.tag2scalars[tag].append((scalar_value, new_avg)) - detected, rule_name = self._ad(scalar_value, history=avg) - if detected: - print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") - exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" - if self.anomaly_inform: - self.anomaly_inform.run(exception_message, self.job_id) + super()._add_scalar(tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False) name = tag.split('/')[0] self.context_dict[name].append(scalar_value) - def _ad(self, scalar_value, history): - return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) - - - -class SummaryWriterWithAD(SummaryWriter): +class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False): super().__init__(path) self.tag2scalars = defaultdict(list) @@ -119,19 +124,6 @@ class SummaryWriterWithAD(SummaryWriter): self.anomaly_inform = anomaly_inform def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): - new_avg = avg = scalar_value - if tag in self.tag2scalars: - N = len(self.tag2scalars[tag]) - _, avg = self.tag2scalars[tag][-1] - new_avg = (avg*N + scalar_value)/(N + 1) - self.tag2scalars[tag].append((scalar_value, new_avg)) - detected, rule_name = self._ad(scalar_value, history=avg) - if detected: - print_info_log(f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}") 
- exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}" - if self.anomaly_inform: - self.anomaly_inform.run(exception_message, self.job_id) + super()._add_scalar(tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False) return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) - - def _ad(self, scalar_value, history): - return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) + \ No newline at end of file -- Gitee From 9d7ab281c02ad7bdac4781df78101789d88858ad Mon Sep 17 00:00:00 2001 From: l30044004 Date: Wed, 7 Aug 2024 16:00:27 +0800 Subject: [PATCH 40/94] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../visualization/compare/mode_adapter.py | 19 ++++++++----------- .../msprobe/pytorch/visualization/utils.py | 12 ++++++++++-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py index bd272271380..b3f286c28f2 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/compare/mode_adapter.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json from msprobe.core.common.const import CompareConst, Const from msprobe.pytorch.visualization.utils import ToolTip, GraphConst, str2float @@ -31,10 +32,9 @@ class ModeAdapter: continue compare_data = compare_data_dict.get(key) if compare_data: - key_list = [CompareConst.RESULT] headers = CompareConst.MD5_COMPARE_RESULT_HEADER - id_list = [headers.index(x) for x in key_list] - ModeAdapter._match_data(value, compare_data, key_list, id_list) + id_list = [headers.index(x) for x in GraphConst.MD5_INDEX_LIST] + ModeAdapter._match_data(value, compare_data, GraphConst.MD5_INDEX_LIST, id_list) # md5比对是否通过 if value.get(CompareConst.RESULT) != CompareConst.PASS: precision_status = False @@ -50,11 +50,9 @@ class ModeAdapter: continue compare_data = compare_data_dict.get(key) if compare_data: - key_list = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, - CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] headers = CompareConst.COMPARE_RESULT_HEADER - id_list = [headers.index(x) for x in key_list] - ModeAdapter._match_data(value, compare_data, key_list, id_list) + id_list = [headers.index(x) for x in GraphConst.REAL_DATA_INDEX_LIST] + ModeAdapter._match_data(value, compare_data, GraphConst.REAL_DATA_INDEX_LIST, id_list) # 获取一个节点所有的输入或输出最小的双千指标 thousandth = value.get(CompareConst.ONE_THOUSANDTH_ERR_RATIO) # 可能是None,可能是非数字内容str @@ -82,9 +80,7 @@ class ModeAdapter: compare_data = compare_data_dict.get(key) if compare_data: # 对应比对结果csv的列 - key_list = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF, - CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, - CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] + key_list = GraphConst.SUMMARY_INDEX_LIST headers = CompareConst.SUMMARY_COMPARE_RESULT_HEADER id_list = [headers.index(x) for x in key_list] ModeAdapter._match_data(value, compare_data, key_list, id_list) @@ -204,7 +200,8 @@ class ModeAdapter: 
else: tips = { CompareConst.ONE_THOUSANDTH_ERR_RATIO: ToolTip.ONE_THOUSANDTH_ERR_RATIO, + CompareConst.FIVE_THOUSANDTHS_ERR_RATIO: ToolTip.FIVE_THOUSANDTHS_ERR_RATIO, CompareConst.COSINE: ToolTip.COSINE, CompareConst.MAX_ABS_ERR: ToolTip.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR: ToolTip.MAX_RELATIVE_ERR} - return tips + return json.dumps(tips) diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py index 546b1bfea62..6721e14c7c2 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py @@ -15,6 +15,7 @@ import json from msprobe.core.common.file_check import FileOpen +from msprobe.core.common.const import CompareConst from msprobe.pytorch.compare.acc_compare import result_to_csv @@ -74,7 +75,8 @@ class ToolTip: MEAN_DIFF = 'NPU与标杆API统计信息比对,平均值的差值' NORM_DIFF = 'NPU与标杆API统计信息比对,2范数(平方根)的差值' MD5 = '数据MD5信息,用于比较两个数据信息是否完全一致' - ONE_THOUSANDTH_ERR_RATIO = 'Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一的比例占总元素个数的比例小于千分之一' + ONE_THOUSANDTH_ERR_RATIO = 'Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之一的比例占总元素个数的比例,比例越接近1越好' + FIVE_THOUSANDTHS_ERR_RATIO = 'Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之五的比例占总元素个数的比例,比例越接近1越好' COSINE = '通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0' MAX_ABS_ERR = '当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001' MAX_RELATIVE_ERR = '当最大相对误差越接近0表示其计算的误差越小。当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象' @@ -101,7 +103,7 @@ class GraphConst: REAL_DATA_COMPARE = 2 JSON_NPU_KEY = 'NPU' JSON_BENCH_KEY = 'Bench' - JSON_TIP_KEY = 'Tooltip' + JSON_TIP_KEY = 'ToolTip' JSON_ROOT_KEY = 'root' JSON_NODE_KEY = 'node' DATA_KEY = 'data' @@ -119,3 +121,9 @@ class GraphConst: OUTPUT = 'output' STR_MAX_LEN = 50 SMALL_VALUE = 1e-3 + MD5_INDEX_LIST = [CompareConst.RESULT] + REAL_DATA_INDEX_LIST = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, + CompareConst.ONE_THOUSANDTH_ERR_RATIO, 
CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] + SUMMARY_INDEX_LIST = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF, + CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, + CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] -- Gitee From ac76baa4e3f71e26a9286bc4131362e50649fea4 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Wed, 7 Aug 2024 16:45:36 +0800 Subject: [PATCH 41/94] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/msprobe/pytorch/__init__.py | 1 + debug/accuracy_tools/msprobe/pytorch/visualization/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/__init__.py b/debug/accuracy_tools/msprobe/pytorch/__init__.py index 482e850f7ba..58ab1ac35a3 100644 --- a/debug/accuracy_tools/msprobe/pytorch/__init__.py +++ b/debug/accuracy_tools/msprobe/pytorch/__init__.py @@ -2,3 +2,4 @@ from .debugger.precision_debugger import PrecisionDebugger from .common.utils import seed_all from .compare.acc_compare import compare from .compare.distributed_compare import compare_distributed +from .visualization.graph_service import compare_graph, build_graph diff --git a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py index 6721e14c7c2..a1b72ee8500 100644 --- a/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/visualization/utils.py @@ -125,5 +125,5 @@ class GraphConst: REAL_DATA_INDEX_LIST = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] SUMMARY_INDEX_LIST = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF, - CompareConst.NORM_DIFF, 
CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, - CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] + CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, + CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] -- Gitee From fc46648f455bc8538735d9d85b29a27f24fae158 Mon Sep 17 00:00:00 2001 From: jiandaobao Date: Wed, 7 Aug 2024 17:23:22 +0800 Subject: [PATCH 42/94] Repair in response to review comments --- .../accuracy_tools/kj600/kj600/anomaly_analyse.py | 14 ++++++-------- .../kj600/kj600/distributed/wrap_distributed.py | 14 +++++++------- debug/accuracy_tools/kj600/kj600/module_hook.py | 4 ++-- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py index c981d47269c..4d10b6dbb30 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_analyse.py @@ -56,7 +56,7 @@ class AnomalyDataWriter: return anomalies_json @staticmethod - def write_data_in_single_json(json_path, anomalies_data): + def update_data_in_single_json(json_path, anomalies_data): with FileOpen(json_path, "w+") as f: fcntl.flock(f, fcntl.LOCK_EX) json.dump(anomalies_data, f, indent=1) @@ -83,6 +83,7 @@ class AnomalyDataWriter: self.json_path, FileCheckConst.FILE, FileCheckConst.WRITE_ABLE ) file_check.common_check() + print_warn_log(f"The existing file will be deleted: {self.json_path}.") os.remove(self.json_path) Path(self.json_path).touch() change_mode(self.json_path, FileCheckConst.DATA_FILE_AUTHORITY) @@ -103,7 +104,7 @@ class AnomalyDataWriter: else: data_to_write = {} data_to_write.update(anomalies_json) - self.write_data_in_single_json(self.json_path, data_to_write) + self.update_data_in_single_json(self.json_path, data_to_write) class AnomalyDataLoader: @@ -137,10 +138,6 @@ class AnomalyDataLoader: json_path = os.path.join(rank_path, ANOMALY_JSON) if not 
os.path.exists(json_path): continue - file_check = FileChecker( - json_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE - ) - file_check.common_check() with FileOpen(json_path, "r+") as f: fcntl.flock(f, fcntl.LOCK_EX) data_anomalies = json.load(f) @@ -187,15 +184,16 @@ class AnomalyAnalyse: json_path, FileCheckConst.FILE, FileCheckConst.WRITE_ABLE ) file_check.common_check() + print_warn_log(f"The existing file will be deleted: {json_path}.") os.remove(json_path) Path(json_path).touch() change_mode(json_path, FileCheckConst.DATA_FILE_AUTHORITY) - AnomalyDataWriter.write_data_in_single_json(json_path, sorted_data) + AnomalyDataWriter.update_data_in_single_json(json_path, sorted_data) def _get_parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("-d", "--data_path", dest="data_path_dir", default="", type=str, + parser.add_argument("-d", "--data_path", dest="data_path_dir", default="./", type=str, help=" The anomaly detect result dictionary: generate from kj600 tool.", required=True, ) diff --git a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py b/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py index edd5829d50a..49e81ec5a57 100644 --- a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py +++ b/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py @@ -106,13 +106,13 @@ class ApiRegistry: dist.Work.wait = wrapped_wait(dist.Work) -def check_process_group(process_group): - group = None - if isinstance(process_group, dist.ProcessGroup): - group = process_group - if group is None: - group = dist.GroupMember.WORLD # default group - return group +def get_process_group(process_group): + return ( + process_group + if isinstance(process_group, dist.ProcessGroup) + else dist.GroupMember.WORLD + ) + def stack_filter(stack): for pattern in StackBlackList: diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 9e11a337a06..c9b1c541020 100644 
--- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -15,7 +15,7 @@ from kj600.anomaly_detect import AnomalyScanner, AnomalyDataFactory, SummaryWrit from kj600.anomaly_inform import AnomalyInformFactory from kj600.anomaly_analyse import AnomalyDataWriter from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name, TensorMetrics -from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, check_process_group +from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, get_process_group from kj600.utils import print_warn_log, print_info_log, get_param_struct @@ -105,7 +105,7 @@ class TrainerMon: self.optimizer_context = defaultdict(OptimizerContext) self.cc_context = defaultdict(CommunicationContext) self.grad_context = defaultdict(GradContext) - self.process_group = check_process_group(process_group) + self.process_group = get_process_group(process_group) self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) self.module_rank_list = self.config.get("module_ranks", []) -- Gitee From 10cd711b3870f8b56b174744e2c058d2cb37e40a Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 8 Aug 2024 01:20:25 +0000 Subject: [PATCH 43/94] add mean metirc in TensorMetric --- debug/accuracy_tools/kj600/kj600/module_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 456776884af..42bec259615 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -28,7 +28,7 @@ class TensorMetrics: self.metrics = {} #tensor_tag --> [] self.cur_idx = {} - fun_map = {"norm": get_norm, "max": get_max, "min": get_min} + fun_map = {"norm": get_norm, "max": get_max, "min": get_min, "mean": get_mean} #get stats and insert 
into metrics dictionary def stat_insert(self, tensor, stat_ops, module_name, tensor_name, rank, eps=1e-8): prefix = get_summary_writer_tag_name(module_name, tensor_name, rank) -- Gitee From 1962e9453e8c91e99043d3e24beae6ff37a06888 Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 8 Aug 2024 01:20:47 +0000 Subject: [PATCH 44/94] add close for csvwriter --- debug/accuracy_tools/kj600/kj600/anomaly_detect.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 5eb28fc79f7..0313af84bad 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -115,6 +115,9 @@ class CSVWriterWithAD(BaseWriterWithAD): name = tag.split('/')[0] self.context_dict[name].append(scalar_value) + def close(self): + pass + class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False): super().__init__(path) -- Gitee From 6c7b0062424f350458d4b70794c0f84c96e86045 Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 8 Aug 2024 05:42:51 +0000 Subject: [PATCH 45/94] fix bug in writer --- debug/accuracy_tools/kj600/kj600/anomaly_detect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 0313af84bad..26fd0a2535b 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -110,7 +110,7 @@ class CSVWriterWithAD(BaseWriterWithAD): self.context_dict = defaultdict(list) def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): - super()._add_scalar(tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False) + super()._add_scalar(tag, scalar_value, global_step, walltime, new_style, 
double_precision) name = tag.split('/')[0] self.context_dict[name].append(scalar_value) @@ -127,6 +127,6 @@ class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): self.anomaly_inform = anomaly_inform def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): - super()._add_scalar(tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False) + super()._add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) \ No newline at end of file -- Gitee From 2c61b61cd2e39387317c3a10b4b6275d748dd4e9 Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 8 Aug 2024 05:51:52 +0000 Subject: [PATCH 46/94] seperate actvgrad --- .../accuracy_tools/kj600/kj600/module_hook.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index d32a6f8cb15..7f9c45c925c 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -86,14 +86,17 @@ class GradContext: self.post = [] self.acc_metric = [] self.acc = {} - self.actv = [] + self.actv = defaultdict(dict) + self.tensors = [] def reset(self): self.pre.clear() self.post.clear() self.acc_metric.clear() + self.actv.clear() for k,v in self.acc.items(): v.fill_(0.) 
+ self.tensors.clear() @@ -178,7 +181,7 @@ class TrainerMon: self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.micro_batch_number = 1 self.step = -1 - self.rank = dist.get_rank() + self.rank = dist.get_rank() if dist.is_initialized() else None self.weight_hooked = False self.optimizer_hooked = False @@ -232,14 +235,14 @@ class TrainerMon: targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config['targets'].keys() hooked_count = self._hook_module(targets, model_chunk, vpp_stage) print_rank_0(f"> {hooked_count} out of {len(self.config['targets'])} are monitored.") - - if not self.optimizer_hooked: print_rank_0("> parameter names:") - for name, param in model.named_parameters(): + for name, param in model_chunk.named_parameters(): print_rank_0(f"\t{name}") for target_module, _ in self.config['targets'].items(): if name.startswith(target_module) and param.requires_grad: # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 self.param2name[param] = name + + if not self.optimizer_hooked: self.hook_optimizer() return @@ -261,7 +264,6 @@ class TrainerMon: def monitor_gnorm_with_ad(self, model, grad_acc_steps): self.hook_optimizer() self.micro_batch_number = grad_acc_steps - self.wg_distribution = True self.backward_only = True self._register_param_name(model) @@ -319,10 +321,12 @@ class TrainerMon: # print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") # self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') # bwd_context.actvgrad.clear() - if not len(self.grad_context.actv) == self.micro_batch_number: - print_warn_log(f"grad_context.actv not equal to micro_batch_number: {len(self.grad_context.actv)}, {self.micro_batch_number}") - self.write_metrics(self.ops, self.summary_writer, self.grad_context.actv, step, 'grad_actv') - self.grad_context.actv.clear() + for tensor 
in self.grad_context.tensors: + for metric_name in self.ops: + self.grad_context.actv[metric_name].update(get_metrics(metric_name, tensor, self.eps)) + self.grad_context.tensors.clear() + + self.write_metrics(self.ops, self.summary_writer, [self.grad_context.actv], step, 'grad_actv') def write_grad_tb(self, step): if not self.wg_distribution: @@ -333,7 +337,6 @@ class TrainerMon: if self.weight_hooked: self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_accumulated') - self.grad_context.reset() def hook_optimizer(self): # in DDP by default use params_have_main_grad @@ -418,6 +421,7 @@ class TrainerMon: self.write_metrics(self.ops, self.summary_writer, context.metric_list, context.step, 'other') context.metric_list.clear() context.step += 1 + self.grad_context.reset() return @@ -529,22 +533,25 @@ class TrainerMon: tbtag_tensor_map = {} if not context.ignore_in: cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'input_grad', cared_input_grad)) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name+f'_{context.micro_step}', f'input_grad', cared_input_grad)) cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col] - tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'output_grad', cared_output_grad)) - metric_dict = {} - for metric_name in self.ops: - metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name+f'_{context.micro_step}', f'output_grad', cared_output_grad)) + if context.micro_step == 0 and context.actvgrad: print_warn_log(f"actvgrad context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. 
Now clear it.") context.actvgrad.clear() # context.actvgrad.append(metric_dict) - if len(self.grad_context.actv) == context.micro_step: - self.grad_context.actv.append({metric_name:{} for metric_name in self.ops}) - for metric_name in self.ops: - self.grad_context.actv[context.micro_step][metric_name].update(metric_dict[metric_name]) - + self.grad_context.tensors.append(tbtag_tensor_map) + + # calcularte the metric of last micro step + if len(self.grad_context.tensors) == 200: + for tensor in self.grad_context.tensors: + for metric_name in self.ops: + self.grad_context.actv[metric_name].update(get_metrics(metric_name, tensor, self.eps)) + + self.grad_context.tensors.clear() + context.micro_step += 1 if context.micro_step == self.micro_batch_number: context.micro_step = 0 -- Gitee From a2700d9433d20c1d2d385d2abb404df48404ee3f Mon Sep 17 00:00:00 2001 From: wugengjun <451676383@qq.com> Date: Mon, 5 Aug 2024 21:14:08 +0800 Subject: [PATCH 47/94] =?UTF-8?q?=E8=A1=A5=E5=85=85=E5=8F=AF=E8=A7=86?= =?UTF-8?q?=E5=8C=96ut=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../builder/test_graph_builder.py | 52 +++++++++++++ .../builder/test_msprobe_adapter.py | 73 +++++++++++++++++++ .../compare/test_graph_comparator.py | 32 ++++++++ .../compare/test_mode_adapter.py | 61 ++++++++++++++++ .../visualization/graph/test_base_node.py | 64 ++++++++++++++++ .../visualization/graph/test_graph.py | 50 +++++++++++++ .../visualization/graph/test_node_op.py | 28 +++++++ 7 files changed, 360 insertions(+) create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_graph_builder.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_msprobe_adapter.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_graph_comparator.py create mode 100644 
debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_mode_adapter.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_base_node.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_graph.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_node_op.py diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_graph_builder.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_graph_builder.py new file mode 100644 index 00000000000..66eceea4b2a --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_graph_builder.py @@ -0,0 +1,52 @@ +import unittest +from unittest.mock import MagicMock, patch +from msprobe.pytorch.visualization.builder.graph_builder import GraphBuilder, Graph + + +class TestGraphBuilder(unittest.TestCase): + + def setUp(self): + self.construct_path = "step/rank/construct.json" + self.data_path = "step/rank/dump.json" + self.model_name = "TestModel" + self.graph = Graph(self.model_name) + self.construct_dict = { + "Tensor1": "Module1", + "Module1": None + } + self.data_dict = { + "Module1": {"data": "data for Module1"}, + "Tensor1": {"data": "data for Tensor1"} + } + + @patch('msprobe.pytorch.visualization.builder.graph_builder.load_json_file') + @patch('msprobe.pytorch.visualization.builder.graph_builder.load_data_json_file') + def test_build(self, mock_load_data_json_file, mock_load_json_file): + mock_load_data_json_file.return_value = self.data_dict + mock_load_json_file.return_value = self.construct_dict + + graph = GraphBuilder.build(self.construct_path, self.data_path, self.model_name) + self.assertIsNotNone(graph) + self.assertIsInstance(graph, Graph) + self.assertEqual(len(graph.node_map), 3) + + @patch('msprobe.pytorch.visualization.builder.graph_builder.save_json_file') + def test_to_json(self, mock_save_json_file): + 
GraphBuilder.to_json("step/rank/output.vis", self.graph) + mock_save_json_file.assert_called_once() + + @patch('msprobe.pytorch.visualization.graph.node_op.NodeOp.get_node_op') + @patch('msprobe.pytorch.visualization.builder.msprobe_adapter.get_input_output', return_value=([], [])) + def test__init_nodes(self, mock_get_input_output, mock_get_node_op): + GraphBuilder._init_nodes(self.graph, self.construct_dict, self.data_dict) + mock_get_node_op.assert_any_call("Tensor1") + mock_get_node_op.assert_any_call("Module1") + self.assertIs(self.graph.root, self.graph.get_node("TestModel")) + + def test__create_or_get_node(self): + node_op = MagicMock() + data_dict = {"node1": {}} + node = GraphBuilder._create_or_get_node(self.graph, data_dict, node_op, "node1") + self.assertIn("node1", self.graph.node_map) + self.assertEqual(node.input_data, {}) + self.assertEqual(node.output_data, {}) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_msprobe_adapter.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_msprobe_adapter.py new file mode 100644 index 00000000000..12ae24279fd --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/builder/test_msprobe_adapter.py @@ -0,0 +1,73 @@ +import unittest +from unittest.mock import patch +from msprobe.pytorch.visualization.builder.msprobe_adapter import ( + get_compare_mode, + run_real_data, + get_input_output, + compare_data, + format_node_data, + compare_node, + _format_decimal_string, + _format_data +) +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestMsprobeAdapter(unittest.TestCase): + @patch('msprobe.pytorch.visualization.builder.msprobe_adapter.task_dumppath_get', return_value=(True, False)) + def test_get_compare_mode_summary(self, mock_task_dumppath_get): + mode = get_compare_mode("dummy_param") + self.assertEqual(mode, GraphConst.SUMMARY_COMPARE) + + 
@patch('msprobe.pytorch.visualization.builder.msprobe_adapter._do_multi_process') + def test_run_real_data(self, mock_do_multi_process): + run_real_data("dump_path", "csv_path") + mock_do_multi_process.assert_called_once_with("dump_path", "csv_path") + + def test_get_input_output(self): + node_data = { + 'input_args': [{'type': 'torch.Tensor', 'dtype': 'torch.int64', 'shape': [5], + 'Max': 2049.0, 'Min': 0.0, 'Mean': 410.20001220703125, 'Norm': 2049.0009765625, + 'requires_grad': False, 'full_op_name': 'Distributed.broadcast.0.forward_input.0'}, + {'type': 'int', 'value': 0}], + 'input_kwargs': {'group': None}, + 'output': [{'type': 'torch.Tensor', 'dtype': 'torch.int64', 'shape': [5], + 'Max': 2049.0, 'Min': 0.0, 'Mean': 410.20001220703125, 'Norm': 2049.0009765625, + 'requires_grad': False, 'full_op_name': 'Distributed.broadcast.0.forward_output.0'}, + {'type': 'int', 'value': 0}, None] + } + node_id = "Distributed.broadcast.0.forward" + input_data, output_data = get_input_output(node_data, node_id) + self.assertIn("Distributed.broadcast.0.forward_output.0", output_data) + self.assertIn("Distributed.broadcast.0.forward_input.0", input_data) + + def test_compare_data(self): + data_dict_list1 = {'key1': {'type': 'Type1', 'dtype': 'DType1', 'shape': 'Shape1'}} + data_dict_list2 = {'key1': {'type': 'Type1', 'dtype': 'DType1', 'shape': 'Shape1'}} + self.assertTrue(compare_data(data_dict_list1, data_dict_list2)) + + def test_format_node_data(self): + data_dict = {'node1': {'data_name': 'data1', 'full_op_name': 'op1'}} + result = format_node_data(data_dict) + self.assertNotIn('data_name', result['node1']) + self.assertNotIn('requires_grad', result['node1']) + + @patch('msprobe.pytorch.visualization.builder.msprobe_adapter.get_accuracy') + def test_compare_node(self, mock_get_accuracy): + node_ids = ["node1", "node2"] + data_dicts = [{'node1': {"input_args": [], "input_kwargs": {}, "output": {}}}, + {'node2': {"input_args": [], "input_kwargs": {}, "output": {}}}] + 
stack_json_data = {} + result = compare_node(node_ids, data_dicts, stack_json_data, False, False) + mock_get_accuracy.assert_called_once() + self.assertIsInstance(result, list) + + def test__format_decimal_string(self): + s = "0.123456789%" + formatted_s = _format_decimal_string(s) + self.assertIn("0.123457%", formatted_s) + + def test__format_data(self): + data_dict = {'value': 0.123456789} + _format_data(data_dict) + self.assertEqual(data_dict['value'], '0.123457') \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_graph_comparator.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_graph_comparator.py new file mode 100644 index 00000000000..bece5380f04 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_graph_comparator.py @@ -0,0 +1,32 @@ +import unittest +from unittest.mock import patch +from msprobe.pytorch.visualization.compare.graph_comparator import GraphComparator +from msprobe.pytorch.visualization.graph.graph import Graph +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestGraphComparator(unittest.TestCase): + + def setUp(self): + self.graphs = [Graph("model1"), Graph("model2")] + self.data_paths = ["step1/rank/dump.json", "step2/rank/dump.json"] + self.stack_path = "step1/rank/stack.json" + self.output_path = "output/output.vis" + + @patch('msprobe.pytorch.visualization.compare.graph_comparator.get_compare_mode') + @patch('msprobe.pytorch.visualization.compare.graph_comparator.load_json_file') + @patch('msprobe.pytorch.visualization.compare.graph_comparator.load_data_json_file') + def test__parse_param(self, mock_load_data_json_file, mock_load_json_file, mock_get_compare_mode): + mock_load_data_json_file.return_value = "data_dict" + mock_load_json_file.return_value = "construct_dict" + mock_get_compare_mode.return_value = GraphConst.SUMMARY_COMPARE + self.comparator = GraphComparator(self.graphs, 
self.data_paths, self.stack_path, self.output_path) + self.comparator._parse_param(self.data_paths, self.stack_path, self.output_path) + + self.assertEqual(self.comparator.dump_path_param, { + 'npu_json_path': self.data_paths[0], + 'bench_json_path': self.data_paths[1], + 'stack_json_path': self.stack_path, + 'is_print_compare_log': True + }) + self.assertEqual(self.comparator.output_path, self.output_path) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_mode_adapter.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_mode_adapter.py new file mode 100644 index 00000000000..7883a09a341 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/compare/test_mode_adapter.py @@ -0,0 +1,61 @@ +import unittest +from unittest.mock import patch, MagicMock +from msprobe.pytorch.visualization.compare.mode_adapter import ModeAdapter +from msprobe.pytorch.visualization.graph.base_node import BaseNode, NodeOp +from msprobe.pytorch.visualization.utils import GraphConst, ToolTip +from msprobe.core.common.const import CompareConst + + +class TestModeAdapter(unittest.TestCase): + + def setUp(self): + self.node_op = NodeOp.module + self.node_id = "node_1" + self.node = BaseNode(self.node_op, self.node_id) + self.compare_mode = GraphConst.REAL_DATA_COMPARE + self.adapter = ModeAdapter(self.compare_mode) + self.compare_data_dict = [{}, {}] + + def test_add_md5_compare_data(self): + node_data = {'md5_key': 'some_md5_value'} + compare_data_dict = {'md5_key': 'expected_md5_value'} + precision_status = ModeAdapter._add_md5_compare_data(node_data, compare_data_dict) + self.assertTrue(precision_status) + + @patch('msprobe.pytorch.visualization.compare.mode_adapter.ModeAdapter') + def test_parse_result(self, mock_mode_adapter): + mock_mode_adapter._add_summary_compare_data.return_value = (True, 0.5) + self.adapter.compare_mode = GraphConst.SUMMARY_COMPARE + precision_status, precision_index, other_dict = 
self.adapter.parse_result( + self.node, self.compare_data_dict) + self.assertEqual(precision_status, True) + self.assertEqual(precision_index, 0.5) + self.assertEqual(other_dict, {}) + + def test_prepare_real_data(self): + self.adapter.is_real_data_compare = MagicMock(return_value=True) + result = self.adapter.prepare_real_data(self.node) + self.assertTrue(result) + + def test_compare_mode_methods(self): + self.adapter.compare_mode = GraphConst.SUMMARY_COMPARE + self.assertTrue(self.adapter.is_summary_compare()) + self.assertFalse(self.adapter.is_md5_compare()) + self.assertFalse(self.adapter.is_real_data_compare()) + + def test_add_csv_data(self): + compare_result_list = ['result1', 'result2'] + self.adapter.add_csv_data(compare_result_list) + self.assertEqual(self.adapter.csv_data, compare_result_list) + + def test_add_error_key(self): + node_data = {'key': {}} + self.adapter.compare_mode = GraphConst.REAL_DATA_COMPARE + self.adapter.add_error_key(node_data) + self.assertEqual(node_data['key'][GraphConst.ERROR_KEY], + [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]) + + def test_get_tool_tip(self): + self.adapter.compare_mode = GraphConst.MD5_COMPARE + tips = self.adapter.get_tool_tip() + self.assertEqual(tips, {'md5': ToolTip.MD5}) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_base_node.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_base_node.py new file mode 100644 index 00000000000..544950f3588 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_base_node.py @@ -0,0 +1,64 @@ +import unittest +from msprobe.pytorch.visualization.graph.base_node import BaseNode, NodeOp +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestBaseNode(unittest.TestCase): + + def setUp(self): + self.node_op = NodeOp.module + self.node_id = "node_1" + self.up_node = BaseNode(self.node_op, "up_node_1") + self.node = 
BaseNode(self.node_op, self.node_id, self.up_node) + + def test_init_and_str(self): + self.assertEqual(self.node.op, self.node_op) + self.assertEqual(self.node.id, self.node_id) + self.assertEqual(str(self.node), 'id:\tnode_1') + + def test_eq(self): + other_node = BaseNode(self.node_op, self.node_id, self.up_node) + self.assertEqual(self.node, other_node) + + def test_get_suggestions(self): + self.node.get_suggestions() + self.assertIn(GraphConst.SUGGEST_KEY, self.node.suggestions) + + def test_set_input_output(self): + input_data = {'input1': 'value1'} + output_data = {'output1': 'value2'} + self.node.set_input_output(input_data, output_data) + self.assertEqual(self.node.input_data, input_data) + self.assertEqual(self.node.output_data, output_data) + + def test_add_upnode(self): + self.node = BaseNode(self.node_op, self.node_id) + new_up_node = BaseNode(self.node_op, "new_up_node_1") + self.node.add_upnode(new_up_node) + self.assertEqual(self.node.upnode, new_up_node) + self.assertIn(self.node, new_up_node.subnodes) + + def test_add_link(self): + other_node = BaseNode(self.node_op, "other_node_1") + ancestors = ['a1', 'a2'] + self.node.add_link(other_node, ancestors) + self.assertEqual(self.node.matched_node_link, ancestors) + self.assertEqual(other_node.matched_node_link, ancestors) + + def test_to_dict(self): + expected_result = { + 'id': self.node_id, + 'node_type': self.node_op.value, + 'data': {}, + 'output_data': {}, + 'input_data': {}, + 'upnode': self.up_node.id, + 'subnodes': [], + 'matched_node_link': [], + 'suggestions': {} + } + self.assertEqual(self.node.to_dict(), expected_result) + + def test_get_ancestors(self): + expected_ancestors = ['up_node_1'] + self.assertEqual(self.node.get_ancestors(), expected_ancestors) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_graph.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_graph.py new file mode 100644 index 00000000000..19d09874345 --- /dev/null 
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_graph.py @@ -0,0 +1,50 @@ +import unittest +from msprobe.pytorch.visualization.graph.graph import Graph, NodeOp +from msprobe.pytorch.visualization.graph.base_node import BaseNode +from msprobe.pytorch.visualization.utils import GraphConst + + +class TestGraph(unittest.TestCase): + + def setUp(self): + self.graph = Graph("model_name") + self.node_id = "node_id" + self.node_op = NodeOp.module + + def test_add_node_and_get_node(self): + self.graph.add_node(self.node_op, self.node_id) + node = self.graph.get_node(self.node_id) + self.assertIsNotNone(node) + self.assertIn(self.node_id, self.graph.node_map) + + def test_to_dict(self): + self.graph.add_node(self.node_op, self.node_id) + result = self.graph.to_dict() + self.assertEqual(result[GraphConst.JSON_ROOT_KEY], "model_name") + self.assertIn(self.node_id, result[GraphConst.JSON_NODE_KEY]) + + def test_str(self): + self.graph.add_node(self.node_op, self.node_id) + expected_str = f'{self.node_id}' + self.assertIn(expected_str, str(self.graph)) + + def test_match(self): + graph_a = Graph("model_name_a") + graph_b = Graph("model_name_b") + node_a = BaseNode(self.node_op, self.node_id) + graph_a.add_node(NodeOp.module, "node_id_a") + graph_b.add_node(NodeOp.module, "node_id_b") + matched_node, ancestors = Graph.match(graph_a, node_a, graph_b) + self.assertIsNone(matched_node) + self.assertEqual(ancestors, []) + + def test_dfs(self): + graph = Graph("model_name") + graph.add_node(NodeOp.module, "node_a") + graph.add_node(NodeOp.module, "node_b") + node_a = BaseNode(self.node_op, self.node_id) + result = {} + graph.dfs(node_a, result) + self.assertEqual(result, {'node_id': {'id': 'node_id', 'node_type': 0, 'data': {}, + 'output_data': {}, 'input_data': {}, 'upnode': 'None', 'subnodes': [], + 'matched_node_link': [], 'suggestions': {}}}) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_node_op.py 
b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_node_op.py new file mode 100644 index 00000000000..1a340ac8b3c --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/visualization/graph/test_node_op.py @@ -0,0 +1,28 @@ +import unittest +from msprobe.pytorch.visualization.graph.node_op import NodeOp + + +class TestNodeOp(unittest.TestCase): + + def test_get_node_op_valid(self): + node_name = "ModuleTest" + self.assertEqual(NodeOp.get_node_op(node_name), NodeOp.module) + + def test_get_node_op_invalid(self): + node_name = "InvalidNodeName" + with self.assertRaises(Exception): + NodeOp.get_node_op(node_name) + + def test_get_node_op_all(self): + test_cases = [ + ("ModuleTest", NodeOp.module), + ("TensorTest", NodeOp.function_api), + ("TorchTest", NodeOp.function_api), + ("FunctionalTest", NodeOp.function_api), + ("NPUTest", NodeOp.function_api), + ("VFTest", NodeOp.function_api), + ("DistributedTest", NodeOp.function_api), + ("AtenTest", NodeOp.function_api) + ] + for node_name, expected_op in test_cases: + self.assertEqual(NodeOp.get_node_op(node_name), expected_op) -- Gitee From c1fd56267a4545f326be27523765e0b981a099a9 Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 9 Aug 2024 02:05:15 +0000 Subject: [PATCH 48/94] test grad monitor --- .../kj600/kj600/unittest/cc_utils.py | 83 ++++++++++++ .../kj600/kj600/unittest/test_grad_monitor.py | 122 ++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py create mode 100644 debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py diff --git a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py b/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py new file mode 100644 index 00000000000..7588d131d7e --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py @@ -0,0 +1,83 @@ +import os +from functools import partial +import torch +from torch import distributed as dist +from torch import nn +try: + 
import torch_npu + BACKEND = 'hccl' + DEVICE = 'npu' +except: + BACKEND = 'nccl' + DEVICE = 'cuda' + +from kj600.features import square_sum, get_max, get_min, get_zeros +from kj600.module_hook import CommunicationContext + + +OP_FUNCS = { + "min": get_min, + "max": get_max, + "norm": square_sum, + "zeros": partial(get_zeros, eps=1e-8) +} + +def ddp_setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12345" + dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) + +def reset_context(context): + if isinstance(context, CommunicationContext): + context.reset() + elif isinstance(context, dict): + for op, v in context.items(): + v.reset() + +def wrap_reset(func): + def reset_and_test(*args, **kwargs): + print(f"testing {func.__name__}") + reset_context(args[0]) + res = func(*args, **kwargs) + return res + + return reset_and_test + +def assert_empty(data): + assert len(data) == 0, f'data is not empty as expected' + +def assert_nonempty(data): + assert len(data) != 0, f'data is empty' + +def assert_equal(a, b, rank, op_name=None, tag=None): + if a.dim() == 0: + assert a==b, f'inequal in rank {rank}: {a}, {b}, {op_name}, {tag}' + else: + assert torch.equal(a,b), f'inequal in rank {rank}: {a},{b}' + +def assert_inequal(a, b, rank): + if a.dim() == 0: + assert a!=b, f'equal in rank {rank}: {a},{b}' + else: + assert not torch.equal(a,b), f'equal in rank {rank}: {a},{b}' + +def assert_context(data, src, rank): + if len(src) == 0: + assert_empty(data) + else: + assert_nonempty(data) + + for op_name, tensors in data.items(): + for tag, tensor in tensors.items(): + prefix, idx = tag.split('_') + idx = int(idx) + assert_equal(tensor, OP_FUNCS[op_name](src[prefix][idx]), rank, op_name, tag) + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + self.layer = nn.Linear(2,2) + + def forward(self, x): + return self.layer(x) \ No newline at end of file diff --git 
a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py new file mode 100644 index 00000000000..c99d665f811 --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -0,0 +1,122 @@ +import unittest +import shutil +import torch +import json +import os +import pandas as pd +import torch.multiprocessing as mp +from torch.utils.data import DistributedSampler, DataLoader, Dataset +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist + +from kj600.module_hook import TrainerMon +from kj600.unittest.cc_utils import ddp_setup, BACKEND +from tensorboard.backend.event_processing.event_accumulator import EventAccumulator + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc = torch.nn.Linear(2, 2, bias=False) + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(self.fc(x)) + +class ToyDataset(Dataset): + def __init__(self): + self.data = torch.randn(32, 2, requires_grad=True) + self.labels = torch.randint(low=0, high=2, size=(32,)) + def __len__(self): + return len(self.labels) + def __getitem__(self, idx): + return self.data[idx], self.labels[idx] + +def get_config(): + os.environ["KJ600_OUTPUT_DIR"] = "./test_kj600_output" + with open(os.path.join(os.path.dirname(__file__),"config_basic_functions.json"), 'r') as file: + config_test = json.load(file) + return config_test + +def get_grad(model, res): + for name, parameter in model.named_parameters(): + if name not in res: + res[name] = [] + res[name].append(parameter.grad.norm()) + +def train(rank, world_size): + ddp_setup(rank, world_size) + + tp_groups = [[0,1],[2,3]] + dp_groups = [[0,2],[1,3]] + + for pg in tp_groups: + if rank in pg: + tp_group = dist.new_group(pg, backend=BACKEND) + for pg in dp_groups: + if rank in pg: + dp_group = dist.new_group(pg, backend=BACKEND) + + device = f'cuda:{rank}' + mbs = 2 + gbs = 16 + dp = 
world_size + grad_acc = gbs/mbs/dp + + model = Model().to(device=device) + hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_basic_functions.json"), False) # or opt_ty=Megatron_DistributedOptimizer + hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc) + + train_ds = ToyDataset() + sampler = DistributedSampler(train_ds, num_replicas=world_size, rank=rank) + train_loader = DataLoader(train_ds, batch_size=mbs, sampler=sampler) + + optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001) + unreduced = {} + reduced = {} + + # for param in model.parameters(): + # param.data = param.data.half() + micro_step = 0 + optimizer.zero_grad() + for (inputs, targets) in train_loader: + micro_step += 1 + + inputs = inputs.to(device) + targets = targets.to(device) + # inputs and param torch.float32 -> torch.float16 + # inputs = inputs.half() + + # outputs torch.float32 + outputs = model(inputs) + # loss torch.float16 -> torch.float32 + loss = torch.nn.functional.cross_entropy(outputs, targets) + + loss.backward() + + if micro_step == grad_acc: + micro_step = 0 + get_grad(model, unreduced) + + for parameter in model.parameters(): + dist.all_reduce(parameter.grad, group=dp_group) + + get_grad(model, reduced) + optimizer.step() + optimizer.zero_grad() + + + + print(unreduced) + print(reduced) + +def check_grad(reduced, unreduced): + pass + + + +if __name__ == "__main__": + world_size=4 + torch.manual_seed(1234) + mp.spawn(train, args=(world_size,), nprocs=world_size) + + -- Gitee From ebc790c67c832b568f6e5679833d3bb38dd0f896 Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 9 Aug 2024 02:06:52 +0000 Subject: [PATCH 49/94] add config --- .../kj600/unittest/config_grad_monitor.json | 18 ++++++++++++++++++ .../kj600/kj600/unittest/test_grad_monitor.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json diff --git 
a/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json b/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json new file mode 100644 index 00000000000..7f5096bec90 --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json @@ -0,0 +1,18 @@ +{ + "targets": { + "fc": { + "input": "tuple[1]:0", + "output": "tensor", + "input_grad": "tuple[1]:0", + "output_grad": "tuple[1]:0" + } + }, + "wg_distribution": true, + "eps": 1e-8, + "format": "csv", + "ops": [ + "min", + "max", + "norm" + ] +} \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index c99d665f811..08250b7971c 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -63,7 +63,7 @@ def train(rank, world_size): grad_acc = gbs/mbs/dp model = Model().to(device=device) - hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_basic_functions.json"), False) # or opt_ty=Megatron_DistributedOptimizer + hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), False) # or opt_ty=Megatron_DistributedOptimizer hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc) train_ds = ToyDataset() -- Gitee From 821a2403cbc1b829f8e0812a1aab765a901cb0cc Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 9 Aug 2024 02:54:14 +0000 Subject: [PATCH 50/94] add tp ut --- .../kj600/kj600/unittest/test_grad_monitor.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index 08250b7971c..484d8cb620e 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -97,21 +97,39 @@ def 
train(rank, world_size): micro_step = 0 get_grad(model, unreduced) + works = [] for parameter in model.parameters(): - dist.all_reduce(parameter.grad, group=dp_group) + works.append(dist.all_reduce(parameter.grad, group=dp_group,async_op=True)) + + for work in works: + work.wait() + + total_norm = 0. + for parameter in model.parameters(): + total_norm += parameter.grad.norm()**2 + + # work = dist.all_reduce(total_norm, group = tp_group, async_op=True) + # work.wait() + + total_norm = total_norm ** 0.5 get_grad(model, reduced) optimizer.step() optimizer.zero_grad() - - print(unreduced) + print(f"rank {rank} | total_norm {total_norm.item()}") + # print(unreduced) print(reduced) +def get_csv_file(rank): + pass + def check_grad(reduced, unreduced): pass +def check_total_norm(total_norm): + pass if __name__ == "__main__": -- Gitee From 659334a47346b09415a0b71d28edecc9e1d716f7 Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 9 Aug 2024 09:30:10 +0000 Subject: [PATCH 51/94] make attr consist --- debug/accuracy_tools/kj600/kj600/anomaly_detect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 26fd0a2535b..563fba39e61 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -86,7 +86,7 @@ class BaseWriterWithAD: class CSVWriterWithAD(BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False): - self.path = path + self.log_dir = path create_directory(path) self.tag2scalars = defaultdict(list) self.ad_rules = ad_rules @@ -98,7 +98,7 @@ class CSVWriterWithAD(BaseWriterWithAD): def write_csv(self, prefix, step): if len(self.context_dict) == 0: return - filepath = os.path.join(self.path, f'{prefix}_{step}.csv') + filepath = os.path.join(self.log_dir, f'{prefix}_{step}.csv') if not os.path.exists(filepath): make_file_safety(filepath) data_frame = 
pd.DataFrame(columns=self.header) -- Gitee From 74ad01595cbcd787e8ff65f96776ffd89ac5bedb Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 9 Aug 2024 09:30:53 +0000 Subject: [PATCH 52/94] clean --- debug/accuracy_tools/kj600/kj600/module_hook.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 7f9c45c925c..862f126e1e8 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -21,9 +21,8 @@ from kj600.utils import print_warn_log, print_info_log, get_param_struct try: import torch_npu - DEVICE = 'npu' except ImportError: - DEVICE= 'cuda' + pass class ModuleHookContext: def __init__(self, module_name) -> None: @@ -187,7 +186,6 @@ class TrainerMon: self.optimizer_hooked = False self.param2name = defaultdict(str) - self.metric_stream = torch.cuda.Stream() self.mix_precision_optimizer_mon = OptimizerMonFactory.create_optimizer_mon(opt_ty) if opt_ty is None: @@ -268,7 +266,7 @@ class TrainerMon: self._register_param_name(model) self._hook_model_for_grad_acc(model) - # self._hook_weights() + self._hook_weights() self.hook_modules(model, grad_acc_steps) @@ -600,7 +598,7 @@ class TrainerMon: for param, name in self.param2name.items(): key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) - context.acc[key] = torch.zeros_like(param).to(DEVICE) + context.acc[key] = torch.zeros_like(param).to(param.device) param.register_hook(partial(param_hook, grad_acc=context.acc[key])) self.weight_hooked = True -- Gitee From e906828f48cfb70238e3dce2495baf269fe772d2 Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 9 Aug 2024 09:31:23 +0000 Subject: [PATCH 53/94] update wg monitor test --- .../kj600/kj600/unittest/cc_utils.py | 4 +- .../kj600/kj600/unittest/test_grad_monitor.py | 114 +++++++++++------- 2 files changed, 75 insertions(+), 43 deletions(-) diff --git 
a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py b/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py index 7588d131d7e..10c742e7b54 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py @@ -23,8 +23,8 @@ OP_FUNCS = { } def ddp_setup(rank, world_size): - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12345" + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "6006" dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) def reset_context(context): diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index 484d8cb620e..d5035380159 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -1,5 +1,3 @@ -import unittest -import shutil import torch import json import os @@ -8,10 +6,10 @@ import torch.multiprocessing as mp from torch.utils.data import DistributedSampler, DataLoader, Dataset from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist +import torch_npu from kj600.module_hook import TrainerMon from kj600.unittest.cc_utils import ddp_setup, BACKEND -from tensorboard.backend.event_processing.event_accumulator import EventAccumulator class Model(torch.nn.Module): def __init__(self): @@ -29,7 +27,7 @@ class ToyDataset(Dataset): def __len__(self): return len(self.labels) def __getitem__(self, idx): - return self.data[idx], self.labels[idx] + return idx, self.data[idx], self.labels[idx] def get_config(): os.environ["KJ600_OUTPUT_DIR"] = "./test_kj600_output" @@ -43,52 +41,73 @@ def get_grad(model, res): res[name] = [] res[name].append(parameter.grad.norm()) +def init_parallel(model_parallel, data_parallel): + rank = dist.get_rank() + world_size = dist.get_world_size() + + mp_group = None + dp_group = None + + 
num_mp_group = world_size // model_parallel + num_dp_group = world_size // data_parallel + + for i in range(num_mp_group): + start = i * model_parallel + end = (i+1) * model_parallel + + ranks = range(start, end) + if rank in ranks: + mp_group = dist.new_group(ranks) + # mp_group = list(ranks) + + for j in range(num_dp_group): + ranks = range(j, model_parallel*data_parallel, model_parallel) + if rank in ranks: + dp_group = dist.new_group(ranks) + # dp_group = list(ranks) + + return dp_group, mp_group + + def train(rank, world_size): ddp_setup(rank, world_size) - tp_groups = [[0,1],[2,3]] - dp_groups = [[0,2],[1,3]] - - for pg in tp_groups: - if rank in pg: - tp_group = dist.new_group(pg, backend=BACKEND) - for pg in dp_groups: - if rank in pg: - dp_group = dist.new_group(pg, backend=BACKEND) - - device = f'cuda:{rank}' + device = f'npu:{rank}' mbs = 2 gbs = 16 - dp = world_size - grad_acc = gbs/mbs/dp + model_parallel = 1 + data_parallel = world_size // model_parallel + grad_acc = gbs/mbs//data_parallel + + dp_group, mp_group = init_parallel(model_parallel, data_parallel) + # print(rank, dp_group, mp_group) model = Model().to(device=device) hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), False) # or opt_ty=Megatron_DistributedOptimizer hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc) train_ds = ToyDataset() - sampler = DistributedSampler(train_ds, num_replicas=world_size, rank=rank) + sampler = DistributedSampler(train_ds, num_replicas=data_parallel, rank=dist.get_group_rank(dp_group, rank)) train_loader = DataLoader(train_ds, batch_size=mbs, sampler=sampler) optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001) unreduced = {} reduced = {} - # for param in model.parameters(): - # param.data = param.data.half() micro_step = 0 + step = 0 optimizer.zero_grad() - for (inputs, targets) in train_loader: + for (idx, inputs, targets) in train_loader: micro_step += 1 inputs = inputs.to(device) targets = 
targets.to(device) + # print(step, micro_step, idx, rank) # inputs and param torch.float32 -> torch.float16 # inputs = inputs.half() # outputs torch.float32 outputs = model(inputs) - # loss torch.float16 -> torch.float32 loss = torch.nn.functional.cross_entropy(outputs, targets) loss.backward() @@ -97,36 +116,49 @@ def train(rank, world_size): micro_step = 0 get_grad(model, unreduced) - works = [] - for parameter in model.parameters(): - works.append(dist.all_reduce(parameter.grad, group=dp_group,async_op=True)) - - for work in works: - work.wait() + grad_buffer = torch.hstack([p.grad.flatten() for p in model.parameters()]) + dist.all_reduce(grad_buffer, group=dp_group) + start = 0 + for p in model.parameters(): + numel = p.numel() + p.grad = grad_buffer.narrow(0,start,numel).reshape(p.shape) + start += numel + + get_grad(model, reduced) + total_norm = 0. for parameter in model.parameters(): total_norm += parameter.grad.norm()**2 - - # work = dist.all_reduce(total_norm, group = tp_group, async_op=True) - # work.wait() + + # dist.all_reduce(total_norm, group = mp_group) total_norm = total_norm ** 0.5 - get_grad(model, reduced) + print(f"step {step} | rank {rank} | total_norm {total_norm.item()} | loss {loss.item()}") + optimizer.step() optimizer.zero_grad() + step += 1 - print(f"rank {rank} | total_norm {total_norm.item()}") - # print(unreduced) - print(reduced) - -def get_csv_file(rank): - pass - -def check_grad(reduced, unreduced): - pass + reduced_monitor, unreduced_monitor = get_csv_files(hooker.summary_writer.log_dir, step) + check_grad(reduced, unreduced, reduced_monitor, unreduced_monitor, step) + +def assert_info(a, b): + assert round(a,5)==round(b,5), f'{a}, {b}' + +def get_csv_files(logdir, step): + return [pd.read_csv(os.path.join(logdir, f'grad_reduced_{i}.csv')) for i in range(step)], \ + [pd.read_csv(os.path.join(logdir, f'grad_unreduced_{i}.csv')) for i in range(step)] + +def check_grad(reduced, unreduced, reduced_monitor, unreduced_monitor, step): + 
for i in range(step): + try: + assert_info(unreduced_monitor[i].loc[0,'norm'], unreduced['fc.weight'][i].item()) + assert_info(reduced_monitor[i].loc[0,'norm'], reduced['fc.weight'][i].item()) + except AssertionError as e: + print(f'mismatch in rank {dist.get_rank()} step {i}: (monitored | from training) {e}') def check_total_norm(total_norm): pass -- Gitee From caee64d7451f8a29e916fa600c2b7d2a17b839f4 Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 12 Aug 2024 06:01:49 +0000 Subject: [PATCH 54/94] patch given opt.step --- .../accuracy_tools/kj600/kj600/module_hook.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 862f126e1e8..032baa17754 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -259,14 +259,14 @@ class TrainerMon: return metric_dict - def monitor_gnorm_with_ad(self, model, grad_acc_steps): - self.hook_optimizer() + def monitor_gnorm_with_ad(self, model, grad_acc_steps=1, optimizer=None): + self.hook_optimizer(optimizer) self.micro_batch_number = grad_acc_steps self.backward_only = True self._register_param_name(model) self._hook_model_for_grad_acc(model) - self._hook_weights() + # self._hook_weights() self.hook_modules(model, grad_acc_steps) @@ -336,7 +336,7 @@ class TrainerMon: self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_accumulated') - def hook_optimizer(self): + def hook_optimizer(self, optimizer=None): # in DDP by default use params_have_main_grad def optimizer_pre_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] @@ -423,11 +423,26 @@ class TrainerMon: return + def patch_step(func, optimizer): + def wrapper(*args, **kwargs): + optimizer_pre_step_hook(optimizer, args, kwargs) + out = func(*args, **kwargs) + optimizer_post_step_hook(optimizer, args, kwargs) + 
return out + + return wrapper + if self.optimizer_hooked: return - if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): - register_optimizer_step_pre_hook(optimizer_pre_step_hook) - register_optimizer_step_post_hook(optimizer_post_step_hook) + + if optimizer: + optimizer.__class__.step = patch_step( + optimizer.__class__.step, optimizer) + + else: + if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): + register_optimizer_step_pre_hook(optimizer_pre_step_hook) + register_optimizer_step_post_hook(optimizer_post_step_hook) self.optimizer_hooked = True return -- Gitee From 60a038fb9a0ad42c532fca29500f169e67df41f0 Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 12 Aug 2024 06:02:23 +0000 Subject: [PATCH 55/94] update ut --- .../kj600/unittest/config_grad_monitor.json | 3 +- .../kj600/kj600/unittest/test_grad_monitor.py | 169 +++++++++++++----- 2 files changed, 125 insertions(+), 47 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json b/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json index 7f5096bec90..293776e34ec 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json +++ b/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json @@ -13,6 +13,7 @@ "ops": [ "min", "max", - "norm" + "norm", + "mean" ] } \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index d5035380159..c7eef841298 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -1,15 +1,44 @@ import torch import json import os +import shutil import pandas as pd +import unittest import torch.multiprocessing as mp from torch.utils.data import DistributedSampler, DataLoader, Dataset from torch.nn.parallel import 
DistributedDataParallel as DDP import torch.distributed as dist -import torch_npu +# import torch_npu from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import ddp_setup, BACKEND +from kj600.unittest.cc_utils import ddp_setup, BACKEND, DEVICE + +class ToyOptimizer: + def __init__(self, opt, grad_clip=1., mp_group=None): + self.opt = opt + self.grad_clip = grad_clip + self.mp_group = mp_group + + def step(self): + total_norm = 0. + for param_group in self.opt.param_groups: + for parameter in param_group['params']: + total_norm += parameter.grad.norm()**2 + + if self.mp_group: + dist.all_reduce(total_norm, group = self.mp_group) + + total_norm = total_norm ** 0.5 + coef = self.grad_clip / total_norm + if coef < 1.: + for param_group in self.opt.param_groups: + for parameter in param_group['params']: + parameter.grad *= coef + self.opt.step() + return total_norm + + def zero_grad(self): + self.opt.zero_grad() class Model(torch.nn.Module): def __init__(self): @@ -30,11 +59,15 @@ class ToyDataset(Dataset): return idx, self.data[idx], self.labels[idx] def get_config(): - os.environ["KJ600_OUTPUT_DIR"] = "./test_kj600_output" with open(os.path.join(os.path.dirname(__file__),"config_basic_functions.json"), 'r') as file: config_test = json.load(file) return config_test +def clean_output(): + return + folder_path = os.environ.get("KJ600_OUTPUT_DIR") + + def get_grad(model, res): for name, parameter in model.named_parameters(): if name not in res: @@ -58,44 +91,49 @@ def init_parallel(model_parallel, data_parallel): ranks = range(start, end) if rank in ranks: mp_group = dist.new_group(ranks) - # mp_group = list(ranks) + print(rank, 'mp', ranks) - for j in range(num_dp_group): - ranks = range(j, model_parallel*data_parallel, model_parallel) - if rank in ranks: - dp_group = dist.new_group(ranks) - # dp_group = list(ranks) + for j in range(num_dp_group): + ranks = range(j, model_parallel*data_parallel, model_parallel) + if rank in ranks: + dp_group = 
dist.new_group(ranks) + print(rank, 'dp', ranks) return dp_group, mp_group -def train(rank, world_size): +def train(rank, world_size, res): ddp_setup(rank, world_size) - - device = f'npu:{rank}' + device = f'{DEVICE}:{rank}' + grad_clip = 0.5 mbs = 2 gbs = 16 - model_parallel = 1 + model_parallel = 4 data_parallel = world_size // model_parallel grad_acc = gbs/mbs//data_parallel dp_group, mp_group = init_parallel(model_parallel, data_parallel) - # print(rank, dp_group, mp_group) model = Model().to(device=device) + pt_opt = torch.optim.AdamW(model.parameters(), lr=0.0001) + optimizer = ToyOptimizer(pt_opt, grad_clip, mp_group) + + hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), False) # or opt_ty=Megatron_DistributedOptimizer - hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc) + hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc, optimizer=optimizer) + file_path = hooker.summary_writer.log_dir train_ds = ToyDataset() sampler = DistributedSampler(train_ds, num_replicas=data_parallel, rank=dist.get_group_rank(dp_group, rank)) train_loader = DataLoader(train_ds, batch_size=mbs, sampler=sampler) - optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001) + unreduced = {} reduced = {} micro_step = 0 step = 0 + total_norms = [] optimizer.zero_grad() for (idx, inputs, targets) in train_loader: micro_step += 1 @@ -126,47 +164,86 @@ def train(rank, world_size): start += numel get_grad(model, reduced) - - total_norm = 0. 
- for parameter in model.parameters(): - total_norm += parameter.grad.norm()**2 - - # dist.all_reduce(total_norm, group = mp_group) - total_norm = total_norm ** 0.5 + total_norm = optimizer.step() + total_norms.append(total_norm.item()) print(f"step {step} | rank {rank} | total_norm {total_norm.item()} | loss {loss.item()}") - - optimizer.step() + optimizer.zero_grad() step += 1 - reduced_monitor, unreduced_monitor = get_csv_files(hooker.summary_writer.log_dir, step) - check_grad(reduced, unreduced, reduced_monitor, unreduced_monitor, step) - + res[rank] = [file_path, step, total_norms, dist.get_process_group_ranks(mp_group), dist.get_process_group_ranks(dp_group)] + def assert_info(a, b): assert round(a,5)==round(b,5), f'{a}, {b}' -def get_csv_files(logdir, step): - return [pd.read_csv(os.path.join(logdir, f'grad_reduced_{i}.csv')) for i in range(step)], \ - [pd.read_csv(os.path.join(logdir, f'grad_unreduced_{i}.csv')) for i in range(step)] - -def check_grad(reduced, unreduced, reduced_monitor, unreduced_monitor, step): - for i in range(step): - try: - assert_info(unreduced_monitor[i].loc[0,'norm'], unreduced['fc.weight'][i].item()) - assert_info(reduced_monitor[i].loc[0,'norm'], reduced['fc.weight'][i].item()) - except AssertionError as e: - print(f'mismatch in rank {dist.get_rank()} step {i}: (monitored | from training) {e}') - -def check_total_norm(total_norm): - pass - +def get_csv_files(meta): + res = {} + for rank, m in meta.items(): + logdir = m[0] + step = m[1] + res[rank] = [[pd.read_csv(os.path.join(logdir, f'grad_reduced_{i}.csv')) for i in range(step)], \ + [pd.read_csv(os.path.join(logdir, f'grad_unreduced_{i}.csv')) for i in range(step)]] + + return res + +class TestKj600(unittest.TestCase): + def __init__(self, method_name: str) -> None: + super(TestKj600, self).__init__(method_name) + self.config_test = get_config() + self.res = None + + @classmethod + def setUpClass(self): + self.world_size = 4 + torch.manual_seed(1234) + with mp.Manager() as 
manager: + meta = manager.dict() + mp.spawn(train, args=(self.world_size,meta,), nprocs=self.world_size) + self.meta = meta.copy() + + def setUp(self): + self.config_test = get_config() + self.steps = self.meta[0][1] + self.res = get_csv_files(self.meta) + + def test_mean(self): + for rank in range(self.world_size): + dp_group = self.meta[rank][4] + for step in range(self.steps): + mean = 0. + for dp_stage in dp_group: + mean += self.res[dp_stage][1][step].loc[0,'mean'] + reduced_mean = self.res[rank][0][step].loc[0,'mean'] + print(f'checking mean of step {step}, rank {rank}', mean, reduced_mean) + assert_info(mean, reduced_mean) + + def test_gnorm(self): + for rank in range(self.world_size): + mp_group = self.meta[rank][3] + for step in range(self.steps): + total_norm = 0. + for mp_stage in mp_group: + total_norm += self.res[mp_stage][0][step].loc[0,'norm']**2 + print(f'checking total norm of step {step}, rank {rank}', total_norm, self.meta[rank][2][step]**2) + assert_info(total_norm, self.meta[rank][2][step]**2) + + @classmethod + def tearDownClass(cls) -> None: + return + self.clean_output() + + def clean_output(self): + return + for rank in range(self.world_size): + folder_path = self.meta[rank][0] + if os.path.exists(folder_path): + shutil.rmtree(folder_path) + if __name__ == "__main__": - world_size=4 - torch.manual_seed(1234) - mp.spawn(train, args=(world_size,), nprocs=world_size) + unittest.main() -- Gitee From a6c278171fc6fe1c689ebc762b070025c8b63768 Mon Sep 17 00:00:00 2001 From: qianggee Date: Mon, 12 Aug 2024 06:59:11 +0000 Subject: [PATCH 56/94] update ut --- .../kj600/kj600/unittest/test_grad_monitor.py | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index c7eef841298..b3841f22bb3 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ 
b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -59,13 +59,14 @@ class ToyDataset(Dataset): return idx, self.data[idx], self.labels[idx] def get_config(): - with open(os.path.join(os.path.dirname(__file__),"config_basic_functions.json"), 'r') as file: + with open(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), 'r') as file: config_test = json.load(file) return config_test def clean_output(): - return folder_path = os.environ.get("KJ600_OUTPUT_DIR") + if os.path.exists(folder_path): + shutil.rmtree(folder_path) def get_grad(model, res): @@ -91,13 +92,13 @@ def init_parallel(model_parallel, data_parallel): ranks = range(start, end) if rank in ranks: mp_group = dist.new_group(ranks) - print(rank, 'mp', ranks) + print(f'rank {rank} in model parallel group {list(ranks)}') for j in range(num_dp_group): ranks = range(j, model_parallel*data_parallel, model_parallel) if rank in ranks: dp_group = dist.new_group(ranks) - print(rank, 'dp', ranks) + print(f'rank {rank} in data parallel group {list(ranks)}') return dp_group, mp_group @@ -108,7 +109,7 @@ def train(rank, world_size, res): grad_clip = 0.5 mbs = 2 gbs = 16 - model_parallel = 4 + model_parallel = 1 data_parallel = world_size // model_parallel grad_acc = gbs/mbs//data_parallel @@ -118,7 +119,6 @@ def train(rank, world_size, res): pt_opt = torch.optim.AdamW(model.parameters(), lr=0.0001) optimizer = ToyOptimizer(pt_opt, grad_clip, mp_group) - hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), False) # or opt_ty=Megatron_DistributedOptimizer hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc, optimizer=optimizer) file_path = hooker.summary_writer.log_dir @@ -127,7 +127,6 @@ def train(rank, world_size, res): sampler = DistributedSampler(train_ds, num_replicas=data_parallel, rank=dist.get_group_rank(dp_group, rank)) train_loader = DataLoader(train_ds, batch_size=mbs, sampler=sampler) - unreduced = {} reduced = {} @@ -197,6 +196,7 
@@ class TestKj600(unittest.TestCase): @classmethod def setUpClass(self): + os.environ["KJ600_OUTPUT_DIR"] = "./test_kj600_output" self.world_size = 4 torch.manual_seed(1234) with mp.Manager() as manager: @@ -227,20 +227,14 @@ class TestKj600(unittest.TestCase): total_norm = 0. for mp_stage in mp_group: total_norm += self.res[mp_stage][0][step].loc[0,'norm']**2 - print(f'checking total norm of step {step}, rank {rank}', total_norm, self.meta[rank][2][step]**2) + print(f'checking total norm of step {step}, rank {rank}', total_norm**0.5, self.meta[rank][2][step]) assert_info(total_norm, self.meta[rank][2][step]**2) @classmethod def tearDownClass(cls) -> None: - return - self.clean_output() + clean_output() + - def clean_output(self): - return - for rank in range(self.world_size): - folder_path = self.meta[rank][0] - if os.path.exists(folder_path): - shutil.rmtree(folder_path) if __name__ == "__main__": -- Gitee From 655166b97f1ea02624ffe85e162ab005998c9a3f Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 01:51:12 +0000 Subject: [PATCH 57/94] remove duplicate reduced grad --- .../accuracy_tools/kj600/kj600/module_hook.py | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 032baa17754..5a46926ec72 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -24,6 +24,15 @@ try: except ImportError: pass + +def param_is_not_tensor_parallel_duplicate(param, tp_group): + return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel) or ( + torch.distributed.get_rank(group=tp_group) == 0 + ) + +def param_is_data_parallel_duplicate(dp_group): + return torch.distributed.get_rank(group=dp_group) != 0 + class ModuleHookContext: def __init__(self, module_name) -> None: self.step = 0 @@ -156,9 +165,7 @@ class TrainerMon: # anomaly_inform = 
AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None anomaly_inform = None - self.optimizer_hooked = False - self.param_registered = False - self.vpp = False + output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] @@ -184,6 +191,8 @@ class TrainerMon: self.weight_hooked = False self.optimizer_hooked = False + self.param_registered = False + self.vpp = False self.param2name = defaultdict(str) @@ -227,18 +236,14 @@ class TrainerMon: if not isinstance(model, list): model = [model] + self._register_param_name(model) + self.micro_batch_number = grad_acc_steps for vpp_stage, model_chunk in enumerate(model): vpp_stage = f'{vpp_stage}_' if self.vpp else '' targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config['targets'].keys() hooked_count = self._hook_module(targets, model_chunk, vpp_stage) print_rank_0(f"> {hooked_count} out of {len(self.config['targets'])} are monitored.") - print_rank_0("> parameter names:") - for name, param in model_chunk.named_parameters(): - print_rank_0(f"\t{name}") - for target_module, _ in self.config['targets'].items(): - if name.startswith(target_module) and param.requires_grad: # name : language_model.encoder.layers.0.mlp.weight, target_module:language_model.encoder.layers.0 - self.param2name[param] = name if not self.optimizer_hooked: self.hook_optimizer() @@ -247,6 +252,11 @@ class TrainerMon: def _get_wg_metric(self, tag): grad_dict = {} for param, name in self.param2name.items(): + if tag == 'post_grad': + if self.tp_group and not param_is_not_tensor_parallel_duplicate(param, self.tp_group): + continue + if self.dp_group and param_is_data_parallel_duplicate(self.dp_group): + continue grad = param.main_grad if self.params_have_main_grad else param.grad if grad is None: print_warn_log(f"grad is None: {name}, maybe something wrong happened.") @@ -259,14 
+269,18 @@ class TrainerMon: return metric_dict - def monitor_gnorm_with_ad(self, model, grad_acc_steps=1, optimizer=None): + def monitor_gnorm_with_ad(self, model, grad_acc_steps=1, optimizer=None, tp_group=None, dp_group=None): + print_info_log(f'grad acc steps {grad_acc_steps}') self.hook_optimizer(optimizer) self.micro_batch_number = grad_acc_steps self.backward_only = True + self.dp_group = dp_group + self.tp_group = tp_group + self._register_param_name(model) self._hook_model_for_grad_acc(model) - # self._hook_weights() + self._hook_weights() self.hook_modules(model, grad_acc_steps) @@ -592,9 +606,10 @@ class TrainerMon: def _hook_model_for_grad_acc(self, model): def model_backward_hook(module, input_grad, output_grad): model_chunk.micro_step += 1 ## error if vpp - if model_chunk.micro_step == (self.micro_batch_number): + if model_chunk.micro_step == 1: model_chunk.micro_step = 0 wg_metric_dict = self._get_wg_metric(tag='pre_grad') + print_rank_0(wg_metric_dict['norm']) self.grad_context.pre.append(wg_metric_dict) if not isinstance(model, list): @@ -610,6 +625,7 @@ class TrainerMon: def param_hook(grad, grad_acc): with torch.no_grad(): grad_acc += grad + print_rank_0('acc', grad.norm()) for param, name in self.param2name.items(): key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) -- Gitee From 462aa3c5c8448d62f79675bcfc76e32be241c3de Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 01:51:47 +0000 Subject: [PATCH 58/94] check the grad with total grad norm --- debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index b3841f22bb3..66afce8b72c 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -234,9 +234,6 @@ class TestKj600(unittest.TestCase): def 
tearDownClass(cls) -> None: clean_output() - - - if __name__ == "__main__": unittest.main() -- Gitee From ea88b17b1d1890e03255aabb147a941e5c489308 Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 02:18:57 +0000 Subject: [PATCH 59/94] disable weight hook --- debug/accuracy_tools/kj600/kj600/module_hook.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 5a46926ec72..f59d8bf18be 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -280,7 +280,7 @@ class TrainerMon: self._register_param_name(model) self._hook_model_for_grad_acc(model) - self._hook_weights() + # self._hook_weights() self.hook_modules(model, grad_acc_steps) @@ -606,10 +606,9 @@ class TrainerMon: def _hook_model_for_grad_acc(self, model): def model_backward_hook(module, input_grad, output_grad): model_chunk.micro_step += 1 ## error if vpp - if model_chunk.micro_step == 1: + if model_chunk.micro_step == self.micro_batch_number: model_chunk.micro_step = 0 wg_metric_dict = self._get_wg_metric(tag='pre_grad') - print_rank_0(wg_metric_dict['norm']) self.grad_context.pre.append(wg_metric_dict) if not isinstance(model, list): -- Gitee From b2acaa3e5679de501318028e022e455e32b843dc Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 02:38:45 +0000 Subject: [PATCH 60/94] update readme --- debug/accuracy_tools/kj600/README.md | 56 ++++++++++++++++++---------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index 874cfaea31d..e6a8405f403 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -20,16 +20,10 @@ ### 2. 
安装 kj600 -方式一:从 git 直接安装 +方式一:下载源码安装 ``` -pip install git+https://gitee.com/xiangsen2/kj600.git -``` - -方式二:下载源码安装 - -``` -git clone https://gitee.com/xiangsen2/kj600.git +git clone https://gitee.com/qiangge123a/att.git cd kj600 pip install . ``` @@ -40,25 +34,31 @@ pip install . 1. 输出目录 -监控结果写入tensorboard的event文件/csv中,设置输出路径(默认为`kj600_output`) - +监控结果写入tensorboard的event文件/csv中,设置输出路径(默认为`kj600_output`,通过环境变量配置) ```bash export KJ600_OUTPUT_DIR=/xxx/output_dir -tensorboard --logdir=$KJ600_OUTPUT_DIR ``` -2. 在训练脚本中使能工具 +2. 在训练脚本中使能工具(Megatron-LM) ``` from kj600.module_hook import TrainerMon model, optimizer, opt_param_scheduler = setup_model_and_optimizer( model_provider, model_type) -# 模型初始化后插入工具代码 -hooker = TrainerMon("./monitor_config.json", params_have_main_grad=True) -hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size) +# 模型、优化器初始化后插入工具代码 +hooker.monitor_gnorm_with_ad( + model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size, optimizer=optimizer, dp_group=mpu.get_data_parallel_group(), tp_group=mpu.get_tensor_model_parallel_group()) ``` + +| 字段名字 | 是否必选 | 解释 | +| ------------------------------------------------------------ | -------- | -------- | +|"grad_acc_steps"| 必选 |梯度累积的步数,当micro step=grad acc steps时,会触发反向hook获取模型梯度| +|"optimizer"| 可选 |各种并行域reduce后的梯度在opt.step前获取,数据写入在step后进行。默认patch pytorch的优化器,传入其他优化器(如MegatronOptimizer)可以调整工具行为,如clip_grad发生在megatron的优化器中,pytorch的优化器之前。| +|"dp_group"| 可选 |训练过程中的dp_group。dp域通信后,group内所有rank的梯度相同,落盘数据冗余。提供dp_group后,工具仅保留每个dp_group的第一个rank的梯度| +|"tp_group"| 可选 |训练过程中的tp_group。tp域通信后,group内部分参数所有rank的梯度相同,落盘数据冗余。提供tp_group后,工具仅保留每个tp_group中冗余参数在第一个rank的梯度。当前适配Megatron core_v0.6.0, 通过权重属性`tensor_model_parallel`判断是否冗余| + 3. 
在json文件中配置工具 ``` { @@ -70,15 +70,33 @@ hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=args.global_batch_size/ "module_ranks": [0,1,2,3], # 需要监控的rank "wg_distribution": true, "format": "csv", - "alert": { - "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}] - }, "ops": ["norm", "min", "max", "mean"], "eps": 1e-8 } ``` - +4. 结果验证 +训练日志中通常会打屏一个训练步的grad norm。提供了脚本校验落盘数据和打屏信息的一致性。 +```bash +python kj600/unittest/test_monitor.py -m kj600_output/Aug13_02-27-5 -l logs/train_gpt3_TP2_PP1_VPP2_CP1_monitor.log +``` +`-m`指定落盘csv的路径前缀。'-l'指定训练日志。脚本通过关键词`grad norm: `匹配训练日志中的grad norm,根据实际情况修改。从落盘数据计算的grad norm和日志中的grad norm相对偏差超过1%,会有警告。 +示例输出: +```txt +world size: 4 +rank 2 is duplicated in dp group +rank 3 is duplicated in dp group +checking total norm of step 0 +checking total norm of step 1 +checking total norm of step 2 +checking total norm of step 3 +checking total norm of step 4 +checking total norm of step 5 +checking total norm of step 6 +checking total norm of step 7 +checking total norm of step 8 +checking total norm of step 9 +``` ## 详细配置 下面以Ascend/ModelLink训练框架为例,给出kj600工具的使用方法。 -- Gitee From e7667c7badc7326afcb84166c6e5ce306af73395 Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 02:45:09 +0000 Subject: [PATCH 61/94] fix typo --- debug/accuracy_tools/kj600/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index e6a8405f403..c517f485626 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -80,7 +80,7 @@ hooker.monitor_gnorm_with_ad( ```bash python kj600/unittest/test_monitor.py -m kj600_output/Aug13_02-27-5 -l logs/train_gpt3_TP2_PP1_VPP2_CP1_monitor.log ``` -`-m`指定落盘csv的路径前缀。'-l'指定训练日志。脚本通过关键词`grad norm: `匹配训练日志中的grad norm,根据实际情况修改。从落盘数据计算的grad norm和日志中的grad norm相对偏差超过1%,会有警告。 +`-m`指定落盘csv的路径前缀。`-l`指定训练日志。脚本通过关键词`grad norm: `匹配训练日志中的grad norm,根据实际情况修改。从落盘数据计算的grad 
norm和日志中的grad norm相对偏差超过1%,会有警告。 示例输出: ```txt world size: 4 -- Gitee From 9b830fd751988c0453a215052799e9a05f0acdec Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 06:01:32 +0000 Subject: [PATCH 62/94] add test --- .../kj600/kj600/unittest/test_monitor.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py new file mode 100644 index 00000000000..60500dd927e --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py @@ -0,0 +1,75 @@ +import sys +import os +import re +import argparse +import pandas as pd +from glob import glob +from collections import defaultdict + + +def parse_logfile(logfile): + grad_norm = [] + step = [] + with open(logfile) as f: + for line in f.readlines(): + if 'consumed samples' in line: + grad_norm.append(float(re.findall('(?<=grad norm\: )[\d\.]*', line)[0])) + # step = int(re.findall('(?<=iteration)[ \d]*', line)[0]) + return grad_norm + + +def parse_monitor_output(output_dir): + reduced = {} + unreduced = {} + for dir in glob(output_dir+'*'): + rank = int(re.findall('(?<=rank)[\d]*', dir)[0]) + unreduced[rank] = [] + reduced[rank] = [] + for file in os.listdir(dir): + df = pd.read_csv(os.path.join(dir, file)) + if '_unreduced_' in file: + unreduced[rank].append(df) + pass + elif '_reduced_' in file: + reduced[rank].append(df) + elif 'acc' in file: + # unreduced[rank].append(df) + pass + else: + print(f'unexpected file {file} in {dir}') + return reduced, unreduced + + +def assert_equal(a, b): + assert abs(a/b-1)<0.01, f'{a}, {b}' + + +def valid_total_norm(total_norm, reduced): + steps = len(total_norm) + world_size = len(reduced) + print('world size: ', world_size) + for step in range(steps): + calculated_norm = 0. 
+ dp_size = 0 + for rank in range(world_size): + if len(reduced[rank]) == 0: + if step == 0: + print(f'rank {rank} is duplicated in dp group') + continue + for index, row in reduced[rank][step].iterrows(): + calculated_norm += row['norm']**2 + dp_size += 1 + print(f'checking total norm of step {step}') + assert_equal(calculated_norm**0.5, total_norm[step]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--monitor_output', '-m', type=str, required=True, help='path prefix to the output of monitor e.g. kj600_output/Aug12_07-16') + parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file') + + args = parser.parse_args() + total_norm = parse_logfile(args.logfile) + reduced, unreduced = parse_monitor_output(args.monitor_output) + + valid_total_norm(total_norm, reduced) \ No newline at end of file -- Gitee From e87430112c813c8733a15be84a0b61e10ec38615 Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 15:02:14 +0000 Subject: [PATCH 63/94] update ut --- .../kj600/kj600/unittest/test_monitor.py | 75 +++++++++++++++---- 1 file changed, 62 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py index 60500dd927e..5f16bbeac56 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py @@ -26,31 +26,71 @@ def parse_monitor_output(output_dir): unreduced[rank] = [] reduced[rank] = [] for file in os.listdir(dir): + # step = int(re.search("(?<=reduced\_)[\d]*", file)[0]) + # if step != 0: + # continue df = pd.read_csv(os.path.join(dir, file)) if '_unreduced_' in file: unreduced[rank].append(df) pass elif '_reduced_' in file: reduced[rank].append(df) - elif 'acc' in file: - # unreduced[rank].append(df) - pass else: print(f'unexpected file {file} in {dir}') return reduced, unreduced - + +def 
valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel): + steps = len(reduced[0]) + world_size = len(reduced) + errors = [] + for index, row in unreduced[0][0].iterrows(): + param = row['param_name'] + is_tp_duplicate = False + for step in range(2): + # sum reduced + reduced_mean = 0. + for rank in range(world_size): + if len(reduced[rank]) == 0: + continue + df = reduced[rank][step] + value = list(df[df['param_name'] == param]['mean']) + if value == []: + if step == 0: + is_tp_duplicate = True + continue + reduced_mean += value[0] + + # sum unreduced + unreduced_mean = 0. + for rank in range(world_size): + df = unreduced[rank][step] + unreduced_mean += list(df[df['param_name'] == param]['mean'])[0] + + unreduced_mean /= dp_size + if is_tp_duplicate and (not sequence_parallel or 'embedding' in param): + unreduced_mean /= tp_size + try: + assert_equal(unreduced_mean, reduced_mean) + except AssertionError as e: + errors.append(param, step, e, is_tp_duplicate) + if errors: + print(errors) + else: + print(f'grad mean is in consisten between unreduced grad and reduced grad monitord.') + + def assert_equal(a, b): - assert abs(a/b-1)<0.01, f'{a}, {b}' + rel_diff = abs(a/b-1) + assert rel_diff<0.01, f'{a}, {b}, {rel_diff}' -def valid_total_norm(total_norm, reduced): +def valid_total_norm(total_norm, reduced, dp_size): steps = len(total_norm) world_size = len(reduced) - print('world size: ', world_size) + errors = [] for step in range(steps): calculated_norm = 0. 
- dp_size = 0 for rank in range(world_size): if len(reduced[rank]) == 0: if step == 0: @@ -58,18 +98,27 @@ def valid_total_norm(total_norm, reduced): continue for index, row in reduced[rank][step].iterrows(): calculated_norm += row['norm']**2 - dp_size += 1 - print(f'checking total norm of step {step}') - assert_equal(calculated_norm**0.5, total_norm[step]) - + try: + assert_equal(calculated_norm**0.5, total_norm[step]) + except AssertionError as e: + errors.append([step, e]) + if errors: + print('total norm errors: ', errors) + else: + print('grad norm in consiste between training log and reduced gradients monitored') + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--monitor_output', '-m', type=str, required=True, help='path prefix to the output of monitor e.g. kj600_output/Aug12_07-16') parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file') + parser.add_argument('--tp_size', '-t', type=int, required=True, help='tp parallel size') + parser.add_argument('--dp_size', '-d', type=int, required=True, help='dp parallel size') + parser.add_argument('--sequence_parallel', '-s', type=bool, default=True, help='whether sequence parallel is enabled') args = parser.parse_args() total_norm = parse_logfile(args.logfile) reduced, unreduced = parse_monitor_output(args.monitor_output) - valid_total_norm(total_norm, reduced) \ No newline at end of file + valid_total_norm(total_norm, reduced, args.dp_size) + valid_reduce(reduced, unreduced, args.tp_size, args.dp_size, args.sequence_parallel) \ No newline at end of file -- Gitee From 4cf143551b4e316bc0f8d7d9b69326779ce921f4 Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 15:14:14 +0000 Subject: [PATCH 64/94] adapt megatron --- .../accuracy_tools/kj600/kj600/module_hook.py | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py 
b/debug/accuracy_tools/kj600/kj600/module_hook.py index f59d8bf18be..e42c64ecdc8 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -102,8 +102,7 @@ class GradContext: self.post.clear() self.acc_metric.clear() self.actv.clear() - for k,v in self.acc.items(): - v.fill_(0.) + self.acc.clear() self.tensors.clear() @@ -253,6 +252,7 @@ class TrainerMon: grad_dict = {} for param, name in self.param2name.items(): if tag == 'post_grad': + pass if self.tp_group and not param_is_not_tensor_parallel_duplicate(param, self.tp_group): continue if self.dp_group and param_is_data_parallel_duplicate(self.dp_group): @@ -263,6 +263,7 @@ class TrainerMon: continue key = get_summary_writer_tag_name(name, tag, self.rank) grad_dict[key] = grad + metric_dict = {} for metric_name in self.ops: metric_dict[metric_name] = get_metrics(metric_name, grad_dict, self.eps) @@ -279,8 +280,7 @@ class TrainerMon: self.tp_group = tp_group self._register_param_name(model) - self._hook_model_for_grad_acc(model) - # self._hook_weights() + self._hook_weights() self.hook_modules(model, grad_acc_steps) @@ -344,10 +344,8 @@ class TrainerMon: if not self.wg_distribution: return - self.write_metrics(self.ops, self.summary_writer, self.grad_context.pre, step, 'grad_unreduced') self.write_metrics(self.ops, self.summary_writer, self.grad_context.post, step, 'grad_reduced') - if self.weight_hooked: - self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_accumulated') + self.write_metrics(self.ops, self.summary_writer, self.grad_context.acc_metric, step, 'grad_unreduced') def hook_optimizer(self, optimizer=None): @@ -490,6 +488,7 @@ class TrainerMon: for target in self.config['targets'].keys(): if param_name.startswith(target) and param.requires_grad: self._smallest_rank_print(f'>> monitoring: {name}') + setattr(param, "zero_out_wgrad", True) self.param2name[param] = name self.param_registered = True @@ -603,32 
+602,19 @@ class TrainerMon: hooked_count += 1 return hooked_count - def _hook_model_for_grad_acc(self, model): - def model_backward_hook(module, input_grad, output_grad): - model_chunk.micro_step += 1 ## error if vpp - if model_chunk.micro_step == self.micro_batch_number: - model_chunk.micro_step = 0 - wg_metric_dict = self._get_wg_metric(tag='pre_grad') - self.grad_context.pre.append(wg_metric_dict) - - if not isinstance(model, list): - model = [model] - - for model_chunk in model: - setattr(model_chunk,'micro_step', 0) - model_chunk.register_full_backward_hook(model_backward_hook) - def _hook_weights(self): context = self.grad_context - def param_hook(grad, grad_acc): - with torch.no_grad(): - grad_acc += grad - print_rank_0('acc', grad.norm()) - + @torch.no_grad + def param_hook(grad, context_dict, param, key): + param.micro_step += 1 + if param.micro_step == self.micro_batch_number: + param.micro_step = 0 + context_dict[key] = (param.main_grad+grad).clone() if self.params_have_main_grad else (param.grad+grad).clone() + for param, name in self.param2name.items(): key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) - context.acc[key] = torch.zeros_like(param).to(param.device) - param.register_hook(partial(param_hook, grad_acc=context.acc[key])) + setattr(param, 'micro_step', 0) + param.register_hook(partial(param_hook, context_dict=context.acc, param=param, key=key)) self.weight_hooked = True -- Gitee From 2fd46c7595fa321317a486fb8d9bdfa39d27a075 Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 15:24:11 +0000 Subject: [PATCH 65/94] typo --- debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py index 5f16bbeac56..49cf8f2a075 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py +++ 
b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py @@ -76,7 +76,7 @@ def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel): if errors: print(errors) else: - print(f'grad mean is in consisten between unreduced grad and reduced grad monitord.') + print(f'grad mean is in consist between unreduced grad and reduced grad monitord.') @@ -105,7 +105,7 @@ def valid_total_norm(total_norm, reduced, dp_size): if errors: print('total norm errors: ', errors) else: - print('grad norm in consiste between training log and reduced gradients monitored') + print('grad norm in consist between training log and reduced gradients monitored') if __name__ == "__main__": @@ -114,7 +114,7 @@ if __name__ == "__main__": parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file') parser.add_argument('--tp_size', '-t', type=int, required=True, help='tp parallel size') parser.add_argument('--dp_size', '-d', type=int, required=True, help='dp parallel size') - parser.add_argument('--sequence_parallel', '-s', type=bool, default=True, help='whether sequence parallel is enabled') + parser.add_argument('--sequence_parallel', '-s', action="store_true", default=False, help='whether sequence parallel is enabled. 
Add -s to store true') args = parser.parse_args() total_norm = parse_logfile(args.logfile) -- Gitee From b83a065107a11cbc39223a9634f7719f529cbccf Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 15:30:15 +0000 Subject: [PATCH 66/94] fix bug --- debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py index 49cf8f2a075..dcd402ad34b 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py @@ -72,7 +72,7 @@ def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel): try: assert_equal(unreduced_mean, reduced_mean) except AssertionError as e: - errors.append(param, step, e, is_tp_duplicate) + errors.append([param, step, e, is_tp_duplicate]) if errors: print(errors) else: -- Gitee From 0f666973858ed3024f77074e4bb19da9a1407f4a Mon Sep 17 00:00:00 2001 From: qianggee Date: Tue, 13 Aug 2024 15:30:26 +0000 Subject: [PATCH 67/94] update readme --- debug/accuracy_tools/kj600/README.md | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index c517f485626..900ab3a0afb 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -78,25 +78,21 @@ hooker.monitor_gnorm_with_ad( 4. 
结果验证 训练日志中通常会打屏一个训练步的grad norm。提供了脚本校验落盘数据和打屏信息的一致性。 ```bash -python kj600/unittest/test_monitor.py -m kj600_output/Aug13_02-27-5 -l logs/train_gpt3_TP2_PP1_VPP2_CP1_monitor.log +python kj600/unittest/test_monitor.py -m kj600_output/Aug13_02-27-5 -l logs/train_gpt3_TP2_PP1_CP1_monitor.log -d 2 -t 2 ``` -`-m`指定落盘csv的路径前缀。`-l`指定训练日志。脚本通过关键词`grad norm: `匹配训练日志中的grad norm,根据实际情况修改。从落盘数据计算的grad norm和日志中的grad norm相对偏差超过1%,会有警告。 +`-m`指定落盘csv的路径前缀。`-l`指定训练日志。脚本通过关键词`grad norm: `匹配训练日志中的grad norm,根据实际情况修改。从落盘数据计算的grad norm和日志中的grad norm相对偏差超过1%,会有警告。`-d`、`--dp_size`声明data parallel size,`-t`、`--tp_size`声明tensor paralllel size。 示例输出: ```txt -world size: 4 rank 2 is duplicated in dp group rank 3 is duplicated in dp group -checking total norm of step 0 -checking total norm of step 1 -checking total norm of step 2 -checking total norm of step 3 -checking total norm of step 4 -checking total norm of step 5 -checking total norm of step 6 -checking total norm of step 7 -checking total norm of step 8 -checking total norm of step 9 +grad norm in consiste between training log and reduced gradients monitored +grad mean is in consisten between unreduced grad and reduced grad monitord. 
``` +需要提供并行相关参数,具体参见: +```bash +python kj600/unittest/test_monitor.py -h +``` + ## 详细配置 下面以Ascend/ModelLink训练框架为例,给出kj600工具的使用方法。 -- Gitee From 87c9fac4d5b3e2b66900146ca826a95ace60af4b Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 01:53:44 +0000 Subject: [PATCH 68/94] update test for pp --- .../kj600/kj600/unittest/test_monitor.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py index dcd402ad34b..b15e7474246 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py @@ -64,6 +64,9 @@ def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel): unreduced_mean = 0. for rank in range(world_size): df = unreduced[rank][step] + value = list(df[df['param_name'] == param]['mean']) + if value == []: + continue unreduced_mean += list(df[df['param_name'] == param]['mean'])[0] unreduced_mean /= dp_size @@ -85,7 +88,7 @@ def assert_equal(a, b): assert rel_diff<0.01, f'{a}, {b}, {rel_diff}' -def valid_total_norm(total_norm, reduced, dp_size): +def valid_total_norm(total_norm, reduced, duplicate_embedding): steps = len(total_norm) world_size = len(reduced) errors = [] @@ -97,6 +100,8 @@ def valid_total_norm(total_norm, reduced, dp_size): print(f'rank {rank} is duplicated in dp group') continue for index, row in reduced[rank][step].iterrows(): + if duplicate_embedding and 'word_embedding' in row['param_name']: + continue calculated_norm += row['norm']**2 try: assert_equal(calculated_norm**0.5, total_norm[step]) @@ -114,11 +119,15 @@ if __name__ == "__main__": parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file') parser.add_argument('--tp_size', '-t', type=int, required=True, help='tp parallel size') parser.add_argument('--dp_size', '-d', type=int, required=True, help='dp parallel 
size') + parser.add_argument('--pp_size', '-p', type=int, required=True, help='pp parallel size') + parser.add_argument('--untie_embeddings_and_output_weights', '-u', action="store_true", default=False, help='whether untie_embeddings_and_output_weights in pp parallel') parser.add_argument('--sequence_parallel', '-s', action="store_true", default=False, help='whether sequence parallel is enabled. Add -s to store true') args = parser.parse_args() total_norm = parse_logfile(args.logfile) reduced, unreduced = parse_monitor_output(args.monitor_output) - valid_total_norm(total_norm, reduced, args.dp_size) + duplicate_embedding = not args.untie_embeddings_and_output_weights and args.pp_size > 1 + + valid_total_norm(total_norm, reduced, duplicate_embedding) valid_reduce(reduced, unreduced, args.tp_size, args.dp_size, args.sequence_parallel) \ No newline at end of file -- Gitee From daedfc8cca5a4d73e1ecf095fdc3af3a3d44ab8b Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 01:54:00 +0000 Subject: [PATCH 69/94] update --- debug/accuracy_tools/kj600/kj600/module_hook.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index e42c64ecdc8..9fec4e01549 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -610,7 +610,13 @@ class TrainerMon: param.micro_step += 1 if param.micro_step == self.micro_batch_number: param.micro_step = 0 - context_dict[key] = (param.main_grad+grad).clone() if self.params_have_main_grad else (param.grad+grad).clone() + if self.params_have_main_grad: + context_dict[key] = (param.main_grad+grad).clone() + elif param.grad is not None: + context_dict[key] = (param.grad+grad).clone() + else: + context_dict[key] = grad.clone() + for param, name in self.param2name.items(): key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) -- Gitee From 
4b4889e95c01fd74ca20b89b7e5844144956148e Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 03:31:12 +0000 Subject: [PATCH 70/94] add option ndigits; squash name --- debug/accuracy_tools/kj600/kj600/anomaly_detect.py | 10 ++++++---- debug/accuracy_tools/kj600/kj600/module_hook.py | 9 +++++---- debug/accuracy_tools/kj600/kj600/module_metric.py | 13 +++++++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 563fba39e61..63a102287b7 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -85,7 +85,7 @@ class BaseWriterWithAD: return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) class CSVWriterWithAD(BaseWriterWithAD): - def __init__(self, path, ad_rules, job_id, anomaly_inform=False): + def __init__(self, path, ad_rules, job_id, anomaly_inform=False, ndigits=6): self.log_dir = path create_directory(path) self.tag2scalars = defaultdict(list) @@ -94,6 +94,7 @@ class CSVWriterWithAD(BaseWriterWithAD): self.anomaly_inform = anomaly_inform self.context_dict = defaultdict(list) self.header = [] + self.ndigits = ndigits def write_csv(self, prefix, step): if len(self.context_dict) == 0: @@ -105,7 +106,7 @@ class CSVWriterWithAD(BaseWriterWithAD): data_frame.to_csv(filepath, index=False) check_file_valid_writable(filepath) - new_data = pd.DataFrame([[name]+metric_value for name, metric_value in self.context_dict.items()]) + new_data = pd.DataFrame([[name]+metric_value if 'vpp' not in name else name.lstrip('vpp').split(':')+metric_value for name, metric_value in self.context_dict.items()]) new_data.to_csv(filepath, mode='a+', header=False, index=False) self.context_dict = defaultdict(list) @@ -113,18 +114,19 @@ class CSVWriterWithAD(BaseWriterWithAD): super()._add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) name = 
tag.split('/')[0] - self.context_dict[name].append(scalar_value) + self.context_dict[name].append(round(scalar_value, self.ndigits)) def close(self): pass class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): - def __init__(self, path, ad_rules, job_id, anomaly_inform=False): + def __init__(self, path, ad_rules, job_id, anomaly_inform=False, ndigits=6): super().__init__(path) self.tag2scalars = defaultdict(list) self.ad_rules = ad_rules self.job_id = job_id self.anomaly_inform = anomaly_inform + self.ndigits = ndigits def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): super()._add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 9fec4e01549..8243796fdb6 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -14,7 +14,7 @@ from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD, CSVWriterWithAD # from kj600.anomaly_inform import AnomalyInformFactory -from kj600.module_metric import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics +from kj600.module_metric import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics, squash_param_name from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate from kj600.utils import print_warn_log, print_info_log, get_param_struct @@ -124,6 +124,7 @@ class TrainerMon: self.format = self.config.get('format', 'tensorboard') self.eps = self.config.get('eps', 1e-8) self.ops = self.config.get('ops', []) + self.ndigits = self.config.get('ndigits', 6) self.xy_distribution = self.config.get('xy_distribution', False) if not 
self.xy_distribution: print_rank_0("> module input/output input_grad/output_grad is not monitored. ") @@ -178,7 +179,7 @@ class TrainerMon: if dist.is_initialized(): if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0: self.summary_writer = writer( - os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) + os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform, self.ndigits) else: self.summary_writer = writer(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, unique_id, anomaly_inform) # A HeatmapVisualizer instance is associated with an image @@ -482,9 +483,9 @@ class TrainerMon: self._smallest_rank_print('vpp enabled') for vpp_stage, model_chunk in enumerate(model): - prefix = f'{vpp_stage}_' if self.vpp else '' + prefix = f'vpp{vpp_stage}:' if self.vpp else '' for param_name, param in model_chunk.named_parameters(): - name = prefix + param_name + name = prefix + squash_param_name(param_name) for target in self.config['targets'].keys(): if param_name.startswith(target) and param.requires_grad: self._smallest_rank_print(f'>> monitoring: {name}') diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 42bec259615..0605a7f8178 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -1,4 +1,5 @@ import math +import re import statistics from kj600.features import square_sum, get_max, get_min, get_zeros, get_nans, get_norm, get_mean @@ -10,6 +11,13 @@ def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): else: return f"{module_or_param_name}/{rank}/{tag}" +def squash_param_name(param_name): + name = '' + for pattern in ['(?<=layers.)[d]*.*', '(?<=embedding.).*', 'final.*', 'output.*']: + match = re.findall(pattern, param_name) + if 
match: + name += match[0] + return name # 用于存储所有metric实现类的注册表 config_metric_registry = {} @@ -186,5 +194,10 @@ def write_metrics_csv(ops, summary_writer, metric_value, step, prefix=''): summary_writer.header = ['param_name'] + ['input_'+op for op in ops] + ['output_'+op for op in ops] else: summary_writer.header = ['param_name'] + ops + + for key in metric_value[0][ops[0]].keys(): + if 'vpp' in key: + summary_writer.header.insert(0, 'vpp_stage') + break summary_writer.write_csv(prefix, step) summary_writer.header = [] \ No newline at end of file -- Gitee From 586c16cbea5b198c3eff5698a55c1cdffecbb29a Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 03:35:29 +0000 Subject: [PATCH 71/94] update readme --- debug/accuracy_tools/kj600/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index 900ab3a0afb..44c12c96368 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -71,7 +71,8 @@ hooker.monitor_gnorm_with_ad( "wg_distribution": true, "format": "csv", "ops": ["norm", "min", "max", "mean"], - "eps": 1e-8 + "eps": 1e-8, + "ndigits: 6 } ``` -- Gitee From 1ac654ae2ded92f6bdd3a1667580fa2eaf8338fd Mon Sep 17 00:00:00 2001 From: heweidong7 <511650494@qq.com> Date: Wed, 14 Aug 2024 14:33:37 +0800 Subject: [PATCH 72/94] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=99=A8=E7=9B=91=E8=A7=86=E5=99=A8=E5=9F=BA=E7=B1=BBOptimizer?= =?UTF-8?q?Mon=EF=BC=8C=E5=A2=9E=E5=8A=A0MegatronFP32OptimizerMon=E7=B1=BB?= =?UTF-8?q?=E7=94=A8=E4=BA=8E=E6=94=AF=E6=8C=81=E9=9D=9E=E6=B7=B7=E5=90=88?= =?UTF-8?q?=E7=B2=BE=E5=BA=A6=E4=BC=98=E5=8C=96=E5=99=A8=EF=BC=8C=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E5=AF=B9=E5=BA=94README=E5=86=85=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/kj600/README.md | 18 +++-- .../accuracy_tools/kj600/kj600/module_hook.py | 41 
+++++----- .../kj600/kj600/optimizer_collect.py | 79 ++++++++++++++----- 3 files changed, 91 insertions(+), 47 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index 81d5be8b618..b41c0460fc9 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -82,12 +82,13 @@ cd kj600/ python3 anomaly_analyse.py -d $KJ600_OUTPUT_DIR/anomaly_detected ``` 支持以下参数配置 -| 字段名字 | 解释 | 是否必选释 | -| ------ | -------- | -------- | -|-d 或 --data_path| 指定梯度异常落盘文件夹,梯度监控功能输出,一般为$KJ600_OUTPUT_DIR/anomaly_detected。|是 | -|-o 或 --out_path| 排序后的异常落盘文件地址,默认在--data_path路径下落盘一个anomaly_analyse.json文件| 否 | -|-k 或 --topk| 指定保留前topk个异常,默认为8| 否 | -|-s 或 --step_list| 指定分析的step范围,默认为[]| 否 | + +| 字段名字 | 解释 | 是否必选 | +|------------------|-------------------------------------------------------------|------| +| -d 或 --data_path | 指定梯度异常落盘文件夹,梯度监控功能输出,一般为$KJ600_OUTPUT_DIR/anomaly_detected。 | 是 | +| -o 或 --out_path | 排序后的异常落盘文件地址,默认在--data_path路径下落盘一个anomaly_analyse.json文件 | 否 | +| -k 或 --topk | 指定保留前topk个异常,默认为8 | 否 | +| -s 或 --step_list | 指定分析的step范围,默认为[] | 否 | ## 详细配置 @@ -177,7 +178,10 @@ python3 anomaly_analyse.py -d $KJ600_OUTPUT_DIR/anomaly_detected 如果不是Megatron-LM的训练框架, 可以设置对应的梯度累积步数grad_acc_steps。 - 如果要监控混合精度优化器的动量和方差, 需要在混合精度优化器构造后加入如下代码。 目前只支持Megatron_DistributedOptimizer, 使用bf16或者fp16混合精度时开启分布式优化器。 或者Megatron_Float16OptimizerWithFloat16Params, 使用bf16或者fp16混合精度选项并且不开启分布式优化器。 + 如果要监控优化器的动量和方差,需要在优化器构造后加入如下代码。 目前支持Megatron实现的优化器: + - Megatron_FP32OptimizerMon,普通优化器。 + - Megatron_Float16OptimizerWithFloat16Params, 使用bf16或者fp16混合精度选项并且不开启分布式优化器。 + - Megatron_DistributedOptimizer, 使用bf16或者fp16混合精度时开启分布式优化器。 ``` model, optimizer, opt_param_scheduler = setup_model_and_optimizer( diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index c9b1c541020..08d4110bb97 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ 
b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -8,7 +8,7 @@ import torch import torch.distributed as dist from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook from kj600.module_spec_verifier import get_config, validate_config_spec -from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon +from kj600.optimizer_collect import OptimizerMon, print_rank_0, OptimizerMonFactory from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer from kj600.anomaly_detect import AnomalyScanner, AnomalyDataFactory, SummaryWriterWithAD @@ -114,11 +114,11 @@ class TrainerMon: self.xy_distribution = self.config.get('xy_distribution', False) if not self.xy_distribution: print_rank_0("> module input/output input_grad/output_grad is not monitored. ") - + # backward hook cause megatron-lm pipeline parallel schedule assert exception. # TBD: backward hook cause output tensor is view of some base tensor. root cause invesigation pending. - self.forward_only = self.config.get('forward_only', False) - if self.forward_only: + self.forward_only = self.config.get('forward_only', False) + if self.forward_only: print_rank_0("> only module forward is monitored. 
") self.ur_distribution = self.config.get('ur_distribution', False) @@ -147,14 +147,14 @@ class TrainerMon: alert_setting = self.config.get('alert', {"rules":[]}) self.alert_rules = AnomalyScanner.load_rules(alert_setting["rules"]) - + anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None self.optimizer_hooked = False self.vpp = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] - + if dist.is_initialized(): rank = dist.get_rank() tensorboard_dir = os.path.join(output_base_dir, f"{cur_time}-rank{rank}-{unique_id}") @@ -170,10 +170,10 @@ class TrainerMon: # 初始化AnomalyData工厂 self.anomaly_data_factory = AnomalyDataFactory(rank, pp_stage, group_mates) self.summary_writer = SummaryWriterWithAD( - tensorboard_dir, - self.alert_rules, - unique_id, - anomaly_inform, + tensorboard_dir, + self.alert_rules, + unique_id, + anomaly_inform, self.anomaly_data_factory) # 初始化anomaly deteted文件目录 self.anomaly_data_writer = AnomalyDataWriter( @@ -203,10 +203,10 @@ class TrainerMon: def __del__(self): if hasattr(self, "summary_writer"): self.summary_writer.close() - + @staticmethod def set_wrapped_optimizer(_wrapped_optimizer): - MixPrecsionOptimizerMon.set_wrapped_optimizer(_wrapped_optimizer) + OptimizerMon.set_wrapped_optimizer(_wrapped_optimizer) @staticmethod def adhoc_check(target_tensor:torch.tensor, module_name:str, tensor_name:str, rank_list, ops_list): @@ -246,7 +246,7 @@ class TrainerMon: def monitor_gnorm_with_ad(self, model, grad_acc_steps, process_group=None): self.micro_batch_number = grad_acc_steps - + self._hook_weights(model) self.hook_optimizer() @@ -267,7 +267,7 @@ class TrainerMon: continue metrics[key] = param_tensor[name] return metrics - + def generate_cc_metrics(self, cc_name, cc_tensor): metrics = defaultdict(dict) rank = dist.get_rank() if dist.is_initialized() else None @@ -324,11 +324,13 
@@ class TrainerMon: self._smallest_rank_print("> Used communication ops and corresponding stack") self._smallest_rank_print(json.dumps({k:[i.split(';') for i in v] for k,v in self.cc_logged_stack.items()}, indent=4)) raise Exception("exit after first step when print cc stack") - - context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self, - optimizer, self.param2name) - + mv_result = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) + context.param_exp_avg = mv_result.exp_avg + context.param_exp_avg_sq = mv_result.exp_avg_sq + context.param_adam_update = mv_result.adam_update + context.param_adam_ratio = mv_result.adam_ratio + rank = dist.get_rank() if dist.is_initialized() else None for param, name in self.param2name.items(): if "params_effrank" in self.config and name in self.config["params_effrank"]: @@ -544,8 +546,7 @@ class TrainerMon: self.module_struct = { prefix + f"{module_name}": {} for module_name, _ in model_chunk.named_modules()} return - + for index, model_chunk in enumerate(model): vpp_stage = index if self.vpp else 0 register_hooks(model_chunk, vpp_stage=vpp_stage) - \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py index 285f17ca6dc..1188fd154c4 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py @@ -1,7 +1,7 @@ -from collections import defaultdict +from abc import ABC, abstractmethod +from collections import defaultdict, namedtuple import torch import torch.distributed as dist -from kj600.visualizer import HeatmapVisualizer def print_rank_0(message, debug=False, force=False): @@ -12,20 +12,29 @@ def print_rank_0(message, debug=False, force=False): print(message) -class MixPrecsionOptimizerMon: +MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", 
"update", "ratio")) + + +class OptimizerMon(ABC): wrapped_optimizer = None + @classmethod + def set_wrapped_optimizer(cls, wrapped_optimizer): + cls.wrapped_optimizer = wrapped_optimizer + + @abstractmethod + def fetch_mv(self, monitor, torch_opt, params2name): + pass + + +class MixPrecisionOptimizerMon(OptimizerMon): def __init__(self) -> None: self.fp16_to_fp32_param = {} - @staticmethod - def set_wrapped_optimizer(_wrapped_optimizer): - MixPrecsionOptimizerMon.wrapped_optimizer = _wrapped_optimizer - # parameter tensors we want to monitor and their names are in params2name_dict # base_optimizer is pytorch optimizer, wrapped_optimizer is a normal object with base_optimizer def fetch_mv(self, monitor, torch_opt, params2name): - mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer + mix_prec_opt = self.wrapped_optimizer if not self.fp16_to_fp32_param and mix_prec_opt is not None: for fp16_group, fp32_group in zip(mix_prec_opt.float16_groups, mix_prec_opt.fp32_from_float16_groups): @@ -42,7 +51,7 @@ class MixPrecsionOptimizerMon: for param, name in params2name.items(): if param in self.fp16_to_fp32_param: param = self.fp16_to_fp32_param[param] - + if param in torch_opt.state: exp_avg = torch_opt.state[param]["exp_avg"] exp_avg_sq = torch_opt.state[param]["exp_avg_sq"] @@ -56,35 +65,65 @@ class MixPrecsionOptimizerMon: ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) - return exp_avg_dict, exp_avg_sq_dict, update_dict, ratio_dict + return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict) -class MegatronDistributedOptimizerMon(MixPrecsionOptimizerMon): +class MegatronDistributedOptimizerMon(MixPrecisionOptimizerMon): def fetch_mv(self, monitor, torch_opt, params2name): - mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer - if not (hasattr(mix_prec_opt, "model_float16_groups") and 
hasattr(mix_prec_opt, "shard_fp32_from_float16_groups")): - raise Exception("megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, \ - if not, please check megatron-lm version") + mix_prec_opt = self.wrapped_optimizer + if not (hasattr(mix_prec_opt, "model_float16_groups") and hasattr(mix_prec_opt, + "shard_fp32_from_float16_groups")): + raise Exception("megatron distributed optimizer should have model_float16_groups " + "and shard_fp32_from_float16_groups, if not, please check megatron-lm version") if not self.fp16_to_fp32_param and mix_prec_opt is not None: - for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, mix_prec_opt.shard_fp32_from_float16_groups): + for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, + mix_prec_opt.shard_fp32_from_float16_groups): for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group): self.fp16_to_fp32_param[fp16_param] = shard_fp32_param return self._fetch_mv_in_adam(params2name, torch_opt, monitor) -class DummyOptimizerMon(MixPrecsionOptimizerMon): +class MegatronFP32OptimizerMon(OptimizerMon): + def fetch_mv(self, monitor, torch_opt, params2name): + exp_avg_dict = defaultdict(float) + exp_avg_sq_dict = defaultdict(float) + update_dict = defaultdict() + ratio_dict = defaultdict() + + for param, name in params2name.items(): + if param in torch_opt.state: + exp_avg = torch_opt.state[param]["exp_avg"] + exp_avg_sq = torch_opt.state[param]["exp_avg_sq"] + if monitor.mv_distribution: + exp_avg_dict[name] = exp_avg + exp_avg_sq_dict[name] = exp_avg_sq + if monitor.mg_direction: + exp_avg_dict[name] = exp_avg + if monitor.ur_distribution: + update_dict[name] = exp_avg / (torch.sqrt(exp_avg_sq) + torch_opt.defaults['eps']) + ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) + monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) + monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) + return 
MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict) + + +class DummyOptimizerMon(OptimizerMon): def fetch_mv(self, monitor, torch_opt, params2name): return None, None, None, None class OptimizerMonFactory: @staticmethod - def create_optimizer_mon(opt_ty:str): + def create_optimizer_mon(opt_ty: str): if opt_ty == "Megatron_Float16OptimizerWithFloat16Params": - return MixPrecsionOptimizerMon() + return MixPrecisionOptimizerMon() if opt_ty == "Megatron_DistributedOptimizer": return MegatronDistributedOptimizerMon() + if opt_ty == "Megatron_FP32Optimizer": + return MegatronFP32OptimizerMon() if opt_ty is None or opt_ty == "unknown": return DummyOptimizerMon() - raise Exception("opt_ty should be Megatron_Float16OptimizerWithFloat16Params or Megatron_DistributedOptimizer or None or unknown") + raise Exception( + "opt_ty should be Megatron_Float16OptimizerWithFloat16Params or Megatron_DistributedOptimizer or " + "Megatron_FP32Optimizer or None or unknown") -- Gitee From 9b39f4df52ab3c63185ce9aeb6f91e210eca5958 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 06:59:34 +0000 Subject: [PATCH 73/94] unreduced wgrad with grad_acc hook --- debug/accuracy_tools/kj600/kj600/module_hook.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 8243796fdb6..708770f9429 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -607,21 +607,20 @@ class TrainerMon: context = self.grad_context @torch.no_grad - def param_hook(grad, context_dict, param, key): + def param_hook(*args, context_dict, param, key): param.micro_step += 1 if param.micro_step == self.micro_batch_number: param.micro_step = 0 if self.params_have_main_grad: - context_dict[key] = (param.main_grad+grad).clone() - elif param.grad is not None: - context_dict[key] = 
(param.grad+grad).clone() + context_dict[key] = param.main_grad.clone() else: - context_dict[key] = grad.clone() - + context_dict[key] = param.grad.clone() for param, name in self.param2name.items(): key = get_summary_writer_tag_name(name, 'acc_grad', self.rank) setattr(param, 'micro_step', 0) - param.register_hook(partial(param_hook, context_dict=context.acc, param=param, key=key)) + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(partial(param_hook, context_dict=context.acc, param=param, key=key)) self.weight_hooked = True -- Gitee From 76ffba74c92cd38dd74d6c77348fc045a25c89ae Mon Sep 17 00:00:00 2001 From: heweidong7 <511650494@qq.com> Date: Wed, 14 Aug 2024 15:33:56 +0800 Subject: [PATCH 74/94] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=EF=BC=9A=E5=A2=9E=E5=8A=A0step=E9=BB=98?= =?UTF-8?q?=E8=AE=A4=E5=8F=96=E5=80=BC=E6=8F=8F=E8=BF=B0=E4=B8=8E=E5=8F=96?= =?UTF-8?q?=E5=80=BC=E7=A4=BA=E4=BE=8B=EF=BC=9B=E5=8F=AA=E8=AF=BB=E5=8F=98?= =?UTF-8?q?=E9=87=8F=E4=BD=BF=E7=94=A8.get=E6=96=B9=E6=B3=95=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/kj600/README.md | 2 +- .../kj600/kj600/optimizer_collect.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index b41c0460fc9..bf3f80b5a3c 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -88,7 +88,7 @@ python3 anomaly_analyse.py -d $KJ600_OUTPUT_DIR/anomaly_detected | -d 或 --data_path | 指定梯度异常落盘文件夹,梯度监控功能输出,一般为$KJ600_OUTPUT_DIR/anomaly_detected。 | 是 | | -o 或 --out_path | 排序后的异常落盘文件地址,默认在--data_path路径下落盘一个anomaly_analyse.json文件 | 否 | | -k 或 --topk | 指定保留前topk个异常,默认为8 | 否 | -| -s 或 --step_list | 指定分析的step范围,默认为[] | 否 | +| -s 或 --step_list | 
指定分析的step范围,如[1,2,3],默认为[],表示使用全部step | 否 | ## 详细配置 diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py index 1188fd154c4..0aea7ff0455 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py @@ -3,6 +3,8 @@ from collections import defaultdict, namedtuple import torch import torch.distributed as dist +from kj600.utils import print_warn_log + def print_rank_0(message, debug=False, force=False): if dist.is_initialized(): @@ -53,8 +55,12 @@ class MixPrecisionOptimizerMon(OptimizerMon): param = self.fp16_to_fp32_param[param] if param in torch_opt.state: - exp_avg = torch_opt.state[param]["exp_avg"] - exp_avg_sq = torch_opt.state[param]["exp_avg_sq"] + state_param = torch_opt.state.get(param, None) + exp_avg = state_param.get("exp_avg", None) + exp_avg_sq = state_param.get("exp_avg_sq", None) + if exp_avg is None or exp_avg_sq is None: + print_warn_log(f"exp_avg or exp_avg_sq of {name} is None, maybe something wrong happened.") + continue if monitor.mv_distribution: exp_avg_dict[name] = exp_avg exp_avg_sq_dict[name] = exp_avg_sq @@ -93,8 +99,12 @@ class MegatronFP32OptimizerMon(OptimizerMon): for param, name in params2name.items(): if param in torch_opt.state: - exp_avg = torch_opt.state[param]["exp_avg"] - exp_avg_sq = torch_opt.state[param]["exp_avg_sq"] + state_param = torch_opt.state.get(param, None) + exp_avg = state_param.get("exp_avg", None) + exp_avg_sq = state_param.get("exp_avg_sq", None) + if exp_avg is None or exp_avg_sq is None: + print_warn_log(f"exp_avg or exp_avg_sq of {name} is None, maybe something wrong happened.") + continue if monitor.mv_distribution: exp_avg_dict[name] = exp_avg exp_avg_sq_dict[name] = exp_avg_sq -- Gitee From 402bf9c5e25d32e690eafad9cc4f64554d06b13d Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 08:37:15 +0000 Subject: [PATCH 75/94] update readme --- 
debug/accuracy_tools/kj600/README.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index e28f0d1863f..b9cbbd0824b 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -23,8 +23,8 @@ 方式一:下载源码安装 ``` -git clone https://gitee.com/qiangge123a/att.git -cd kj600 +git clone -b poc https://gitee.com/ascend/mstt.git +cd mstt/debug/accuracy_tools/kj600 pip install . ``` @@ -47,6 +47,7 @@ from kj600.module_hook import TrainerMon model, optimizer, opt_param_scheduler = setup_model_and_optimizer( model_provider, model_type) # 模型、优化器初始化后插入工具代码 +hooker = TrainerMon("./monitor_config.json", process_group=mpu.get_pipeline_model_parallel_group(), params_have_main_grad=True) hooker.monitor_gnorm_with_ad( model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size, optimizer=optimizer, dp_group=mpu.get_data_parallel_group(), tp_group=mpu.get_tensor_model_parallel_group()) ``` @@ -94,6 +95,14 @@ grad mean is in consisten between unreduced grad and reduced grad monitord. python kj600/unittest/test_monitor.py -h ``` ### 梯度异常时序判断 +0. 训练前配置相关参数 +工具支持自动判断训练过程中的梯度异常,需要在配置文件中设置alert相关字段。`AnomalyTurbulence`会将当前数值与历史均值比较,如果相对偏差超过阈值,会在打屏信息中提示用户。如果打开`dump`选项,则会将异常梯度相关信息落盘,用于后续时序判断。 +```json + "alert": { + "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}], + "dump": true + }, +``` 1. 
进入工具路径启动脚本: ```shell cd kj600/ @@ -127,7 +136,8 @@ python3 anomaly_analyse.py -d $KJ600_OUTPUT_DIR/anomaly_detected "cc_distribution": {"enable":true, "cc_codeline":[]}, "alert": { "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}], - "inform": {"recipient": "database", "connection_str": "mysql+pymysql://username:password@host:port/database"} + "inform": {"recipient": "database", "connection_str": "mysql+pymysql://username:password@host:port/database"}, + "dump": true }, "ops": ["min", "max", "norm", "zeros", "id"], "eps": 1e-8 -- Gitee From d9208a9b83ee2e78683d1dc7c0a63fa717f0f1c6 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 08:53:17 +0000 Subject: [PATCH 76/94] move init to base class --- .../kj600/kj600/anomaly_detect.py | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 1ac30c6b9f9..196eb631996 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -139,8 +139,14 @@ class GradAnomalyData: (str(self.tag_name), "_step_", str(self.step), "_call_" , str(self.call_id))) class BaseWriterWithAD: - def __init__(self): - pass + def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): + self.tag2scalars = defaultdict(list) + self.ad_rules = ad_rules + self.job_id = job_id + self.anomaly_inform = anomaly_inform + self.anomaly_factory = anomaly_factory + self.anomalies = [] + self.ndigits = ndigits def _add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): new_avg = avg = scalar_value @@ -175,13 +181,7 @@ class BaseWriterWithAD: class CSVWriterWithAD(BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): - self.tag2scalars = defaultdict(list) - self.ad_rules = 
ad_rules - self.job_id = job_id - self.anomaly_inform = anomaly_inform - self.anomaly_factory = anomaly_factory - self.anomalies = [] - self.ndigits = ndigits + super().__init__(path, ad_rules, job_id, anomaly_inform, anomaly_factory, ndigits) self.log_dir = path create_directory(path) @@ -214,14 +214,8 @@ class CSVWriterWithAD(BaseWriterWithAD): class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): + super(SummaryWriter, self).__init__(path, ad_rules, job_id, anomaly_inform, anomaly_factory, ndigits) super().__init__(path) - self.tag2scalars = defaultdict(list) - self.ad_rules = ad_rules - self.job_id = job_id - self.anomaly_inform = anomaly_inform - self.anomaly_factory = anomaly_factory - self.anomalies = [] - self.ndigits = ndigits def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): super()._add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) -- Gitee From 2f65d2822003b4458310cadcbdc609eb7ba1159a Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 09:05:15 +0000 Subject: [PATCH 77/94] fix not monitored rank --- debug/accuracy_tools/kj600/kj600/module_hook.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index a4eebbe2664..284e48d6071 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -174,6 +174,7 @@ class TrainerMon: tensorboard_dir = os.path.join(output_base_dir, f"{cur_time}-{unique_id}") pp_stage = 0 group_mates = [0] + self.rank = rank # 初始化AnomalyData工厂 self.anomaly_data_factory = AnomalyDataFactory(rank, pp_stage, group_mates) if alert_setting.get('dump', False) else None @@ -205,8 +206,6 @@ class TrainerMon: self.update_heatmap_visualizer = 
defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.micro_batch_number = 1 - self.step = -1 - self.rank = dist.get_rank() if dist.is_initialized() else None self.weight_hooked = False self.optimizer_hooked = False @@ -252,7 +251,7 @@ class TrainerMon: def hook_modules(self, model:torch.nn.Module, grad_acc_steps): # fwd=0, bkd=1 # targets is module name list like ["xx.xxx1", "xxx.xxx2"] which can be obtained when first run. - if (dist.is_initialized() and dist.get_rank() not in self.module_rank_list): + if self.module_rank_list and (self.rank not in self.module_rank_list): return if not isinstance(model, list): -- Gitee From 45ccb8de8fc10fb0cdc56ca4ed32ef35c246ebeb Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 09:07:34 +0000 Subject: [PATCH 78/94] fix format --- debug/accuracy_tools/kj600/kj600/anomaly_detect.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 196eb631996..d6b6e347d45 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -163,7 +163,7 @@ class BaseWriterWithAD: self.anomaly_inform.run(exception_message, self.job_id) if self.anomaly_factory: - self.anomalies.append(self.anomaly_factory.create(tag, exception_message, global_step)) + self.anomalies.append(self.anomaly_factory.create(tag, exception_message, global_step)) def _ad(self, scalar_value, history): return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) @@ -188,7 +188,6 @@ class CSVWriterWithAD(BaseWriterWithAD): self.context_dict = defaultdict(list) self.header = [] - def write_csv(self, prefix, step): if len(self.context_dict) == 0: return -- Gitee From d005f21f268f35c77f6e3afccdb775c8521c6301 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 09:11:13 +0000 Subject: [PATCH 79/94] update ut --- 
.../kj600/kj600/unittest/cc_utils.py | 83 ------------------- .../kj600/kj600/unittest/test_grad_monitor.py | 16 +++- 2 files changed, 14 insertions(+), 85 deletions(-) delete mode 100644 debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py diff --git a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py b/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py deleted file mode 100644 index 10c742e7b54..00000000000 --- a/debug/accuracy_tools/kj600/kj600/unittest/cc_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -from functools import partial -import torch -from torch import distributed as dist -from torch import nn -try: - import torch_npu - BACKEND = 'hccl' - DEVICE = 'npu' -except: - BACKEND = 'nccl' - DEVICE = 'cuda' - -from kj600.features import square_sum, get_max, get_min, get_zeros -from kj600.module_hook import CommunicationContext - - -OP_FUNCS = { - "min": get_min, - "max": get_max, - "norm": square_sum, - "zeros": partial(get_zeros, eps=1e-8) -} - -def ddp_setup(rank, world_size): - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = "6006" - dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) - -def reset_context(context): - if isinstance(context, CommunicationContext): - context.reset() - elif isinstance(context, dict): - for op, v in context.items(): - v.reset() - -def wrap_reset(func): - def reset_and_test(*args, **kwargs): - print(f"testing {func.__name__}") - reset_context(args[0]) - res = func(*args, **kwargs) - return res - - return reset_and_test - -def assert_empty(data): - assert len(data) == 0, f'data is not empty as expected' - -def assert_nonempty(data): - assert len(data) != 0, f'data is empty' - -def assert_equal(a, b, rank, op_name=None, tag=None): - if a.dim() == 0: - assert a==b, f'inequal in rank {rank}: {a}, {b}, {op_name}, {tag}' - else: - assert torch.equal(a,b), f'inequal in rank {rank}: {a},{b}' - -def assert_inequal(a, b, rank): - if a.dim() == 0: - assert a!=b, f'equal in rank 
{rank}: {a},{b}' - else: - assert not torch.equal(a,b), f'equal in rank {rank}: {a},{b}' - -def assert_context(data, src, rank): - if len(src) == 0: - assert_empty(data) - else: - assert_nonempty(data) - - for op_name, tensors in data.items(): - for tag, tensor in tensors.items(): - prefix, idx = tag.split('_') - idx = int(idx) - assert_equal(tensor, OP_FUNCS[op_name](src[prefix][idx]), rank, op_name, tag) - - -class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - self.layer = nn.Linear(2,2) - - def forward(self, x): - return self.layer(x) \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index 66afce8b72c..abbb03e18e8 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -8,10 +8,22 @@ import torch.multiprocessing as mp from torch.utils.data import DistributedSampler, DataLoader, Dataset from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist -# import torch_npu +try: + import torch_npu + BACKEND = 'hccl' + DEVICE = 'npu' +except: + BACKEND = 'nccl' + DEVICE = 'cuda' from kj600.module_hook import TrainerMon -from kj600.unittest.cc_utils import ddp_setup, BACKEND, DEVICE + + +def ddp_setup(rank, world_size): + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "6006" + dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) + class ToyOptimizer: def __init__(self, opt, grad_clip=1., mp_group=None): -- Gitee From 9627a20306b14da96e9b473d425033b40d3cf484 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 09:35:12 +0000 Subject: [PATCH 80/94] fix ut --- .../kj600/kj600/unittest/config_grad_monitor.json | 3 ++- .../kj600/kj600/unittest/test_grad_monitor.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git 
a/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json b/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json index 293776e34ec..28735dad88b 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json +++ b/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json @@ -1,6 +1,6 @@ { "targets": { - "fc": { + "layers": { "input": "tuple[1]:0", "output": "tensor", "input_grad": "tuple[1]:0", @@ -8,6 +8,7 @@ } }, "wg_distribution": true, + "print_struct": false, "eps": 1e-8, "format": "csv", "ops": [ diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py index abbb03e18e8..4140fabba02 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py @@ -55,11 +55,14 @@ class ToyOptimizer: class Model(torch.nn.Module): def __init__(self): super().__init__() - self.fc = torch.nn.Linear(2, 2, bias=False) + self.layers = torch.nn.Sequential( + torch.nn.Linear(2, 2, bias=False), + torch.nn.Linear(2, 2, bias=False) + ) self.relu = torch.nn.ReLU() def forward(self, x): - return self.relu(self.fc(x)) + return self.relu(self.layers(x)) class ToyDataset(Dataset): def __init__(self): @@ -131,7 +134,7 @@ def train(rank, world_size, res): pt_opt = torch.optim.AdamW(model.parameters(), lr=0.0001) optimizer = ToyOptimizer(pt_opt, grad_clip, mp_group) - hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), False) # or opt_ty=Megatron_DistributedOptimizer + hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), params_have_main_grad=False) # or opt_ty=Megatron_DistributedOptimizer hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc, optimizer=optimizer) file_path = hooker.summary_writer.log_dir -- Gitee From 31775fe27edf70a51adf1e703a2ad4c458bfc884 Mon Sep 17 00:00:00 2001 From: qianggee 
Date: Wed, 14 Aug 2024 09:40:17 +0000 Subject: [PATCH 81/94] rm ut --- .../kj600/unittest/config_grad_monitor.json | 20 -- .../kj600/kj600/unittest/test_grad_monitor.py | 255 ------------------ 2 files changed, 275 deletions(-) delete mode 100644 debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json delete mode 100644 debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json b/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json deleted file mode 100644 index 28735dad88b..00000000000 --- a/debug/accuracy_tools/kj600/kj600/unittest/config_grad_monitor.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "targets": { - "layers": { - "input": "tuple[1]:0", - "output": "tensor", - "input_grad": "tuple[1]:0", - "output_grad": "tuple[1]:0" - } - }, - "wg_distribution": true, - "print_struct": false, - "eps": 1e-8, - "format": "csv", - "ops": [ - "min", - "max", - "norm", - "mean" - ] -} \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py deleted file mode 100644 index 4140fabba02..00000000000 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_grad_monitor.py +++ /dev/null @@ -1,255 +0,0 @@ -import torch -import json -import os -import shutil -import pandas as pd -import unittest -import torch.multiprocessing as mp -from torch.utils.data import DistributedSampler, DataLoader, Dataset -from torch.nn.parallel import DistributedDataParallel as DDP -import torch.distributed as dist -try: - import torch_npu - BACKEND = 'hccl' - DEVICE = 'npu' -except: - BACKEND = 'nccl' - DEVICE = 'cuda' - -from kj600.module_hook import TrainerMon - - -def ddp_setup(rank, world_size): - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = "6006" - dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) - - -class ToyOptimizer: - def 
__init__(self, opt, grad_clip=1., mp_group=None): - self.opt = opt - self.grad_clip = grad_clip - self.mp_group = mp_group - - def step(self): - total_norm = 0. - for param_group in self.opt.param_groups: - for parameter in param_group['params']: - total_norm += parameter.grad.norm()**2 - - if self.mp_group: - dist.all_reduce(total_norm, group = self.mp_group) - - total_norm = total_norm ** 0.5 - coef = self.grad_clip / total_norm - if coef < 1.: - for param_group in self.opt.param_groups: - for parameter in param_group['params']: - parameter.grad *= coef - self.opt.step() - return total_norm - - def zero_grad(self): - self.opt.zero_grad() - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.layers = torch.nn.Sequential( - torch.nn.Linear(2, 2, bias=False), - torch.nn.Linear(2, 2, bias=False) - ) - self.relu = torch.nn.ReLU() - - def forward(self, x): - return self.relu(self.layers(x)) - -class ToyDataset(Dataset): - def __init__(self): - self.data = torch.randn(32, 2, requires_grad=True) - self.labels = torch.randint(low=0, high=2, size=(32,)) - def __len__(self): - return len(self.labels) - def __getitem__(self, idx): - return idx, self.data[idx], self.labels[idx] - -def get_config(): - with open(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), 'r') as file: - config_test = json.load(file) - return config_test - -def clean_output(): - folder_path = os.environ.get("KJ600_OUTPUT_DIR") - if os.path.exists(folder_path): - shutil.rmtree(folder_path) - - -def get_grad(model, res): - for name, parameter in model.named_parameters(): - if name not in res: - res[name] = [] - res[name].append(parameter.grad.norm()) - -def init_parallel(model_parallel, data_parallel): - rank = dist.get_rank() - world_size = dist.get_world_size() - - mp_group = None - dp_group = None - - num_mp_group = world_size // model_parallel - num_dp_group = world_size // data_parallel - - for i in range(num_mp_group): - start = i * model_parallel - end = 
(i+1) * model_parallel - - ranks = range(start, end) - if rank in ranks: - mp_group = dist.new_group(ranks) - print(f'rank {rank} in model parallel group {list(ranks)}') - - for j in range(num_dp_group): - ranks = range(j, model_parallel*data_parallel, model_parallel) - if rank in ranks: - dp_group = dist.new_group(ranks) - print(f'rank {rank} in data parallel group {list(ranks)}') - - return dp_group, mp_group - - -def train(rank, world_size, res): - ddp_setup(rank, world_size) - device = f'{DEVICE}:{rank}' - grad_clip = 0.5 - mbs = 2 - gbs = 16 - model_parallel = 1 - data_parallel = world_size // model_parallel - grad_acc = gbs/mbs//data_parallel - - dp_group, mp_group = init_parallel(model_parallel, data_parallel) - - model = Model().to(device=device) - pt_opt = torch.optim.AdamW(model.parameters(), lr=0.0001) - optimizer = ToyOptimizer(pt_opt, grad_clip, mp_group) - - hooker = TrainerMon(os.path.join(os.path.dirname(__file__),"config_grad_monitor.json"), params_have_main_grad=False) # or opt_ty=Megatron_DistributedOptimizer - hooker.monitor_gnorm_with_ad(model=model, grad_acc_steps=grad_acc, optimizer=optimizer) - file_path = hooker.summary_writer.log_dir - - train_ds = ToyDataset() - sampler = DistributedSampler(train_ds, num_replicas=data_parallel, rank=dist.get_group_rank(dp_group, rank)) - train_loader = DataLoader(train_ds, batch_size=mbs, sampler=sampler) - - unreduced = {} - reduced = {} - - micro_step = 0 - step = 0 - total_norms = [] - optimizer.zero_grad() - for (idx, inputs, targets) in train_loader: - micro_step += 1 - - inputs = inputs.to(device) - targets = targets.to(device) - # print(step, micro_step, idx, rank) - # inputs and param torch.float32 -> torch.float16 - # inputs = inputs.half() - - # outputs torch.float32 - outputs = model(inputs) - loss = torch.nn.functional.cross_entropy(outputs, targets) - - loss.backward() - - if micro_step == grad_acc: - micro_step = 0 - get_grad(model, unreduced) - - grad_buffer = torch.hstack([p.grad.flatten() 
for p in model.parameters()]) - dist.all_reduce(grad_buffer, group=dp_group) - - start = 0 - for p in model.parameters(): - numel = p.numel() - p.grad = grad_buffer.narrow(0,start,numel).reshape(p.shape) - start += numel - - get_grad(model, reduced) - - total_norm = optimizer.step() - total_norms.append(total_norm.item()) - - print(f"step {step} | rank {rank} | total_norm {total_norm.item()} | loss {loss.item()}") - - optimizer.zero_grad() - - step += 1 - - res[rank] = [file_path, step, total_norms, dist.get_process_group_ranks(mp_group), dist.get_process_group_ranks(dp_group)] - -def assert_info(a, b): - assert round(a,5)==round(b,5), f'{a}, {b}' - -def get_csv_files(meta): - res = {} - for rank, m in meta.items(): - logdir = m[0] - step = m[1] - res[rank] = [[pd.read_csv(os.path.join(logdir, f'grad_reduced_{i}.csv')) for i in range(step)], \ - [pd.read_csv(os.path.join(logdir, f'grad_unreduced_{i}.csv')) for i in range(step)]] - - return res - -class TestKj600(unittest.TestCase): - def __init__(self, method_name: str) -> None: - super(TestKj600, self).__init__(method_name) - self.config_test = get_config() - self.res = None - - @classmethod - def setUpClass(self): - os.environ["KJ600_OUTPUT_DIR"] = "./test_kj600_output" - self.world_size = 4 - torch.manual_seed(1234) - with mp.Manager() as manager: - meta = manager.dict() - mp.spawn(train, args=(self.world_size,meta,), nprocs=self.world_size) - self.meta = meta.copy() - - def setUp(self): - self.config_test = get_config() - self.steps = self.meta[0][1] - self.res = get_csv_files(self.meta) - - def test_mean(self): - for rank in range(self.world_size): - dp_group = self.meta[rank][4] - for step in range(self.steps): - mean = 0. 
- for dp_stage in dp_group: - mean += self.res[dp_stage][1][step].loc[0,'mean'] - reduced_mean = self.res[rank][0][step].loc[0,'mean'] - print(f'checking mean of step {step}, rank {rank}', mean, reduced_mean) - assert_info(mean, reduced_mean) - - def test_gnorm(self): - for rank in range(self.world_size): - mp_group = self.meta[rank][3] - for step in range(self.steps): - total_norm = 0. - for mp_stage in mp_group: - total_norm += self.res[mp_stage][0][step].loc[0,'norm']**2 - print(f'checking total norm of step {step}, rank {rank}', total_norm**0.5, self.meta[rank][2][step]) - assert_info(total_norm, self.meta[rank][2][step]**2) - - @classmethod - def tearDownClass(cls) -> None: - clean_output() - -if __name__ == "__main__": - unittest.main() - - -- Gitee From 526f074cbac2333f85aa5cc84a579751a2769b49 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 09:49:14 +0000 Subject: [PATCH 82/94] clean code --- .../kj600/kj600/anomaly_detect.py | 21 +++++++++--------- .../accuracy_tools/kj600/kj600/module_hook.py | 22 ++++--------------- .../kj600/kj600/module_metric.py | 1 - 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index d6b6e347d45..a870d974ff4 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -1,12 +1,12 @@ import os +import sys import statistics as st from abc import ABC from typing import List +from collections import defaultdict import pandas as pd -import sys -from dataclasses import dataclass, field from torch.utils.tensorboard import SummaryWriter -from collections import defaultdict +from dataclasses import dataclass, field from kj600.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory @@ -71,6 +71,7 @@ class AnomalyDataFactory(ABC): self.group_mates = group_mates self.micro_step = 0 self.vpp_stage = 0 + 
self.name2callid = {} def set_call_id(self, name2callid): """根据当前GradContext信息更新call_id vpp_stage等信息 @@ -148,7 +149,7 @@ class BaseWriterWithAD: self.anomalies = [] self.ndigits = ndigits - def _add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): + def _add_scalar(self, tag, scalar_value, global_step=None): new_avg = avg = scalar_value if tag in self.tag2scalars: N = len(self.tag2scalars[tag]) @@ -176,8 +177,6 @@ class BaseWriterWithAD: def clear_anomalies(self): self.anomalies.clear() - def _ad(self, scalar_value, history): - return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) class CSVWriterWithAD(BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): @@ -202,8 +201,8 @@ class CSVWriterWithAD(BaseWriterWithAD): new_data.to_csv(filepath, mode='a+', header=False, index=False) self.context_dict = defaultdict(list) - def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): - super()._add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) + def add_scalar(self, tag, scalar_value, global_step): + super()._add_scalar(tag, scalar_value, global_step) name = tag.split('/')[0] self.context_dict[name].append(round(scalar_value, self.ndigits)) @@ -216,7 +215,7 @@ class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): super(SummaryWriter, self).__init__(path, ad_rules, job_id, anomaly_inform, anomaly_factory, ndigits) super().__init__(path) - def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): - super()._add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) - return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision) + def add_scalar(self, tag, scalar_value, global_step): + super()._add_scalar(tag, scalar_value, global_step) 
+ return super().add_scalar(tag, scalar_value, global_step) \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 284e48d6071..3519cbf38e9 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -52,9 +52,6 @@ class ModuleHookContext: self.format_by_arg[key_name] = target_config[self.module_name][key_name] elif key_name in ['input', 'input_grad']: self.ignore_in = True - # else: - # raise KeyError(f"Missing key: {key_name} of {self.module_name} in config.json") - class OptimizerContext: def __init__(self) -> None: @@ -211,6 +208,8 @@ class TrainerMon: self.optimizer_hooked = False self.param_registered = False self.vpp = False + self.dp_group = None + self.tp_group = None self.param2name = defaultdict(str) self.param_name_call_id = {} @@ -249,8 +248,6 @@ class TrainerMon: TrainerMon.tensor_metrics.stat_insert(target_tensor, ops_list, module_name, tensor_name, rank) def hook_modules(self, model:torch.nn.Module, grad_acc_steps): - # fwd=0, bkd=1 - # targets is module name list like ["xx.xxx1", "xxx.xxx2"] which can be obtained when first run. 
if self.module_rank_list and (self.rank not in self.module_rank_list): return @@ -270,7 +267,7 @@ class TrainerMon: self.hook_optimizer() return - def _get_wg_metric(self, tag): + def generate_wgrad_metrics(self, tag): grad_dict = {} for param, name in self.param2name.items(): if tag == 'post_grad': @@ -348,14 +345,6 @@ class TrainerMon: self.write_metrics(self.ops, self.summary_writer, fwd_context.actv, step, 'actv') fwd_context.actv.clear() - # for _, bwd_context in self.module_bwd_hook_context_by_module.items(): - # if len(bwd_context.actvgrad) == 0: - # continue - # if not len(bwd_context.actvgrad) == self.micro_batch_number: - # print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") - # self.write_metrics(self.ops, self.summary_writer, bwd_context.actvgrad, step, 'grad_actv') - # bwd_context.actvgrad.clear() - self.write_metrics(self.ops, self.summary_writer, [self.grad_context.actv], step, 'grad_actv') def write_grad_tb(self, step): @@ -390,7 +379,7 @@ class TrainerMon: metric_dict[metric_name] = get_metrics(metric_name, self.grad_context.acc, self.eps) self.grad_context.acc_metric = [metric_dict] - wg_metric_dict = self._get_wg_metric(tag='post_grad') + wg_metric_dict = self.generate_wgrad_metrics(tag='post_grad') self.grad_context.post.append(wg_metric_dict) for param, name in self.param2name.items(): @@ -414,8 +403,6 @@ class TrainerMon: tbtag_tensor_map.update(self.generate_param_metrics('exp_avg_sq', context.param_exp_avg_sq)) if self.mg_direction: tbtag_tensor_map.update(self.generate_param_metrics('mg_direction', context.param_mg_direction)) - # if not tbtag_tensor_map: - # return metric_dict = {} for metric_name in self.ops: metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) @@ -588,7 +575,6 @@ class TrainerMon: if context.micro_step == 0 and context.actvgrad: print_warn_log(f"actvgrad context of {context.module_name} is not empty when first 
micro_step, maybe something wrong happened. Now clear it.") context.actvgrad.clear() - # context.actvgrad.append(metric_dict) for metric_name in self.ops: self.grad_context.actv[metric_name].update(get_metrics(metric_name, tbtag_tensor_map, self.eps)) diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 8418da85e9b..3918b218bac 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -186,7 +186,6 @@ def write_metrics_csv(ops, summary_writer, metric_value, step, prefix=''): fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step) except KeyError as e: - print(e) raise ValueError(f"Not supported this metric, expected metric: {config_metric_registry.keys()}, actual metric: {metric_name}") from e if not summary_writer.header: -- Gitee From d681dc8bac166117181c136a703fdb4ac0f35b44 Mon Sep 17 00:00:00 2001 From: qianggee Date: Wed, 14 Aug 2024 09:59:15 +0000 Subject: [PATCH 83/94] clean code --- .../kj600/kj600/anomaly_detect.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index a870d974ff4..179c767325b 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -4,9 +4,9 @@ import statistics as st from abc import ABC from typing import List from collections import defaultdict +from dataclasses import dataclass, field import pandas as pd from torch.utils.tensorboard import SummaryWriter -from dataclasses import dataclass, field from kj600.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory @@ -82,7 +82,7 @@ class AnomalyDataFactory(ABC): """如果检查出异常, 调用当前接口生成GradAnomalyData实例 """ param_name = tag_name.split('/')[0] - call_id = self.name2callid[param_name] + call_id = 
self.name2callid.get(param_name,-1) if 'vpp' in param_name: vpp_stage = int(param_name.lstrip('vpp').split(':')[0]) else: @@ -149,6 +149,14 @@ class BaseWriterWithAD: self.anomalies = [] self.ndigits = ndigits + def get_anomalies(self): + """返回已检测到的异常列表 + """ + return self.anomalies + + def clear_anomalies(self): + self.anomalies.clear() + def _add_scalar(self, tag, scalar_value, global_step=None): new_avg = avg = scalar_value if tag in self.tag2scalars: @@ -168,15 +176,7 @@ class BaseWriterWithAD: def _ad(self, scalar_value, history): return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value) - - def get_anomalies(self): - """返回已检测到的异常列表 - """ - return self.anomalies - def clear_anomalies(self): - self.anomalies.clear() - class CSVWriterWithAD(BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): -- Gitee From ab14f3f3896dce3bcd247de7a17cc0a61ea22467 Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 15 Aug 2024 03:05:44 +0000 Subject: [PATCH 84/94] avoid divide 0 --- .../kj600/kj600/unittest/test_monitor.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py index b15e7474246..ddea3244f5c 100644 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py +++ b/debug/accuracy_tools/kj600/kj600/unittest/test_monitor.py @@ -84,7 +84,14 @@ def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel): def assert_equal(a, b): - rel_diff = abs(a/b-1) + if b == 0 or a == 0: + return + if b == 0: + rel_diff = a + elif a == 0: + rel_diff = b + else: + rel_diff = abs(a/b-1) assert rel_diff<0.01, f'{a}, {b}, {rel_diff}' @@ -124,6 +131,11 @@ if __name__ == "__main__": parser.add_argument('--sequence_parallel', '-s', action="store_true", default=False, help='whether sequence parallel is enabled. 
Add -s to store true') args = parser.parse_args() + + assert args.tp_size > 0, 'if tp not enabled, set tp_size = 1' + assert args.dp_size > 0, 'if tp not enabled, set dp_size = 1' + assert args.pp_size > 0, 'if tp not enabled, set pp_size = 1' + total_norm = parse_logfile(args.logfile) reduced, unreduced = parse_monitor_output(args.monitor_output) -- Gitee From 5637a48873cebbea16d9401485a50d3e5ee1d7db Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 15 Aug 2024 03:07:29 +0000 Subject: [PATCH 85/94] fix regular --- debug/accuracy_tools/kj600/kj600/module_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py index 3918b218bac..ebcf6c400c0 100644 --- a/debug/accuracy_tools/kj600/kj600/module_metric.py +++ b/debug/accuracy_tools/kj600/kj600/module_metric.py @@ -13,7 +13,7 @@ def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): def squash_param_name(param_name): name = '' - for pattern in ['(?<=layers.)[d]*.*', '(?<=embedding.).*', 'final.*', 'output.*']: + for pattern in ['(?<=layers.)[\d]*.*', '(?<=embedding.).*', 'final.*', 'output.*']: match = re.findall(pattern, param_name) if match: name += match[0] -- Gitee From a0cf4bdfb5424c6123c12af69a6481e70cf9d9fb Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 15 Aug 2024 03:31:30 +0000 Subject: [PATCH 86/94] add const class --- debug/accuracy_tools/kj600/kj600/anomaly_detect.py | 13 ++++++++++--- debug/accuracy_tools/kj600/kj600/const.py | 4 ++++ debug/accuracy_tools/kj600/kj600/module_hook.py | 6 +++--- 3 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 debug/accuracy_tools/kj600/kj600/const.py diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index 179c767325b..fd5ed2d69e9 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ 
b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -8,6 +8,7 @@ from dataclasses import dataclass, field import pandas as pd from torch.utils.tensorboard import SummaryWriter from kj600.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory +from kj600.const import Const class ScanRule(ABC): @@ -83,8 +84,8 @@ class AnomalyDataFactory(ABC): """ param_name = tag_name.split('/')[0] call_id = self.name2callid.get(param_name,-1) - if 'vpp' in param_name: - vpp_stage = int(param_name.lstrip('vpp').split(':')[0]) + if Const.vpp in param_name: + vpp_stage = int(param_name.lstrip(Const.vpp).split(Const.vpp_sep)[0]) else: vpp_stage = 0 @@ -197,7 +198,13 @@ class CSVWriterWithAD(BaseWriterWithAD): data_frame.to_csv(filepath, index=False) check_file_valid_writable(filepath) - new_data = pd.DataFrame([[name]+metric_value if 'vpp' not in name else name.lstrip('vpp').split(':')+metric_value for name, metric_value in self.context_dict.items()]) + new_data = [] + for name, metric_value in self.context_dict.items(): + if Const.vpp not in name: + new_data.append([name]+metric_value) + else: + new_data.append(name.lstrip(Const.vpp).split(Const.vpp_sep)+metric_value) + new_data = pd.DataFrame(new_data) new_data.to_csv(filepath, mode='a+', header=False, index=False) self.context_dict = defaultdict(list) diff --git a/debug/accuracy_tools/kj600/kj600/const.py b/debug/accuracy_tools/kj600/kj600/const.py new file mode 100644 index 00000000000..e4198a99422 --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/const.py @@ -0,0 +1,4 @@ + +class Const: + vpp = "vpp" + vpp_sep = ':' \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 3519cbf38e9..c41dcaed721 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -18,6 +18,7 @@ from kj600.anomaly_analyse import AnomalyDataWriter from kj600.module_metric 
import get_metrics, write_metrics_tensorboard, write_metrics_csv, get_summary_writer_tag_name, TensorMetrics, squash_param_name from kj600.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, get_process_group from kj600.utils import print_warn_log, print_info_log, get_param_struct +from kj600.const import Const try: @@ -258,7 +259,7 @@ class TrainerMon: self.micro_batch_number = grad_acc_steps for vpp_stage, model_chunk in enumerate(model): - vpp_stage = f'{vpp_stage}_' if self.vpp else '' + vpp_stage = f'{vpp_stage}{Const.vpp_sep}' if self.vpp else '' targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config['targets'].keys() hooked_count = self._hook_module(targets, model_chunk, vpp_stage) print_rank_0(f"> {hooked_count} out of {len(self.config['targets'])} are monitored.") @@ -271,7 +272,6 @@ class TrainerMon: grad_dict = {} for param, name in self.param2name.items(): if tag == 'post_grad': - pass if self.tp_group and not param_is_not_tensor_parallel_duplicate(param, self.tp_group): continue if self.dp_group and param_is_data_parallel_duplicate(self.dp_group): @@ -491,7 +491,7 @@ class TrainerMon: self._smallest_rank_print('vpp enabled') for vpp_stage, model_chunk in enumerate(model): - prefix = f'vpp{vpp_stage}:' if self.vpp else '' + prefix = f'{Const.vpp}{vpp_stage}{Const.vpp_sep}' if self.vpp else '' for param_name, param in model_chunk.named_parameters(): name = prefix + squash_param_name(param_name) for target in self.config['targets'].keys(): -- Gitee From ed75f8d49584ada85ce0c23d4e691863da8049b8 Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 15 Aug 2024 07:04:28 +0000 Subject: [PATCH 87/94] change dump file mode --- debug/accuracy_tools/kj600/kj600/anomaly_detect.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py index fd5ed2d69e9..ad0ac9a1a22 100644 --- 
a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py @@ -9,6 +9,7 @@ import pandas as pd from torch.utils.tensorboard import SummaryWriter from kj600.utils import print_info_log, check_file_valid_writable, make_file_safety, create_directory from kj600.const import Const +from kj600.file_check import change_mode, FileCheckConst class ScanRule(ABC): @@ -196,6 +197,7 @@ class CSVWriterWithAD(BaseWriterWithAD): make_file_safety(filepath) data_frame = pd.DataFrame(columns=self.header) data_frame.to_csv(filepath, index=False) + change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY) check_file_valid_writable(filepath) new_data = [] @@ -221,6 +223,7 @@ class SummaryWriterWithAD(SummaryWriter, BaseWriterWithAD): def __init__(self, path, ad_rules, job_id, anomaly_inform=False, anomaly_factory=None, ndigits=6): super(SummaryWriter, self).__init__(path, ad_rules, job_id, anomaly_inform, anomaly_factory, ndigits) super().__init__(path) + change_mode(path, FileCheckConst.DATA_DIR_AUTHORITY) def add_scalar(self, tag, scalar_value, global_step): super()._add_scalar(tag, scalar_value, global_step) -- Gitee From a84d8233def5f5850db784b747a179454d74b316 Mon Sep 17 00:00:00 2001 From: heweidong7 <511650494@qq.com> Date: Thu, 15 Aug 2024 17:06:25 +0800 Subject: [PATCH 88/94] =?UTF-8?q?=E6=92=A4=E9=94=80=E4=B8=8D=E5=BF=85?= =?UTF-8?q?=E8=A6=81=E7=9A=84=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/kj600/README.md | 13 ++++---- .../accuracy_tools/kj600/kj600/module_hook.py | 30 ++++++++++--------- .../kj600/kj600/optimizer_collect.py | 16 ++++------ 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md index bf3f80b5a3c..9ef40ad2037 100644 --- a/debug/accuracy_tools/kj600/README.md +++ b/debug/accuracy_tools/kj600/README.md @@ -82,13 
+82,12 @@ cd kj600/ python3 anomaly_analyse.py -d $KJ600_OUTPUT_DIR/anomaly_detected ``` 支持以下参数配置 - -| 字段名字 | 解释 | 是否必选 | -|------------------|-------------------------------------------------------------|------| -| -d 或 --data_path | 指定梯度异常落盘文件夹,梯度监控功能输出,一般为$KJ600_OUTPUT_DIR/anomaly_detected。 | 是 | -| -o 或 --out_path | 排序后的异常落盘文件地址,默认在--data_path路径下落盘一个anomaly_analyse.json文件 | 否 | -| -k 或 --topk | 指定保留前topk个异常,默认为8 | 否 | -| -s 或 --step_list | 指定分析的step范围,如[1,2,3],默认为[],表示使用全部step | 否 | +| 字段名字 | 解释 | 是否必选释 | +| ------ | -------- | -------- | +|-d 或 --data_path| 指定梯度异常落盘文件夹,梯度监控功能输出,一般为$KJ600_OUTPUT_DIR/anomaly_detected。|是 | +|-o 或 --out_path| 排序后的异常落盘文件地址,默认在--data_path路径下落盘一个anomaly_analyse.json文件| 否 | +|-k 或 --topk| 指定保留前topk个异常,默认为8| 否 | +|-s 或 --step_list| 指定分析的step范围,默认为[]| 否 | ## 详细配置 diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 08d4110bb97..a0cacbb7981 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -114,11 +114,11 @@ class TrainerMon: self.xy_distribution = self.config.get('xy_distribution', False) if not self.xy_distribution: print_rank_0("> module input/output input_grad/output_grad is not monitored. ") - + # backward hook cause megatron-lm pipeline parallel schedule assert exception. # TBD: backward hook cause output tensor is view of some base tensor. root cause invesigation pending. - self.forward_only = self.config.get('forward_only', False) - if self.forward_only: + self.forward_only = self.config.get('forward_only', False) + if self.forward_only: print_rank_0("> only module forward is monitored. 
") self.ur_distribution = self.config.get('ur_distribution', False) @@ -147,14 +147,14 @@ class TrainerMon: alert_setting = self.config.get('alert', {"rules":[]}) self.alert_rules = AnomalyScanner.load_rules(alert_setting["rules"]) - + anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None self.optimizer_hooked = False self.vpp = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] - + if dist.is_initialized(): rank = dist.get_rank() tensorboard_dir = os.path.join(output_base_dir, f"{cur_time}-rank{rank}-{unique_id}") @@ -170,10 +170,10 @@ class TrainerMon: # 初始化AnomalyData工厂 self.anomaly_data_factory = AnomalyDataFactory(rank, pp_stage, group_mates) self.summary_writer = SummaryWriterWithAD( - tensorboard_dir, - self.alert_rules, - unique_id, - anomaly_inform, + tensorboard_dir, + self.alert_rules, + unique_id, + anomaly_inform, self.anomaly_data_factory) # 初始化anomaly deteted文件目录 self.anomaly_data_writer = AnomalyDataWriter( @@ -203,7 +203,7 @@ class TrainerMon: def __del__(self): if hasattr(self, "summary_writer"): self.summary_writer.close() - + @staticmethod def set_wrapped_optimizer(_wrapped_optimizer): OptimizerMon.set_wrapped_optimizer(_wrapped_optimizer) @@ -246,7 +246,7 @@ class TrainerMon: def monitor_gnorm_with_ad(self, model, grad_acc_steps, process_group=None): self.micro_batch_number = grad_acc_steps - + self._hook_weights(model) self.hook_optimizer() @@ -267,7 +267,7 @@ class TrainerMon: continue metrics[key] = param_tensor[name] return metrics - + def generate_cc_metrics(self, cc_name, cc_tensor): metrics = defaultdict(dict) rank = dist.get_rank() if dist.is_initialized() else None @@ -324,7 +324,8 @@ class TrainerMon: self._smallest_rank_print("> Used communication ops and corresponding stack") self._smallest_rank_print(json.dumps({k:[i.split(';') for i in v] for k,v in 
self.cc_logged_stack.items()}, indent=4)) raise Exception("exit after first step when print cc stack") - + + mv_result = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) context.param_exp_avg = mv_result.exp_avg context.param_exp_avg_sq = mv_result.exp_avg_sq @@ -546,7 +547,8 @@ class TrainerMon: self.module_struct = { prefix + f"{module_name}": {} for module_name, _ in model_chunk.named_modules()} return - + for index, model_chunk in enumerate(model): vpp_stage = index if self.vpp else 0 register_hooks(model_chunk, vpp_stage=vpp_stage) + \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py index 0aea7ff0455..beb28924915 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py @@ -53,7 +53,7 @@ class MixPrecisionOptimizerMon(OptimizerMon): for param, name in params2name.items(): if param in self.fp16_to_fp32_param: param = self.fp16_to_fp32_param[param] - + if param in torch_opt.state: state_param = torch_opt.state.get(param, None) exp_avg = state_param.get("exp_avg", None) @@ -77,13 +77,11 @@ class MixPrecisionOptimizerMon(OptimizerMon): class MegatronDistributedOptimizerMon(MixPrecisionOptimizerMon): def fetch_mv(self, monitor, torch_opt, params2name): mix_prec_opt = self.wrapped_optimizer - if not (hasattr(mix_prec_opt, "model_float16_groups") and hasattr(mix_prec_opt, - "shard_fp32_from_float16_groups")): - raise Exception("megatron distributed optimizer should have model_float16_groups " - "and shard_fp32_from_float16_groups, if not, please check megatron-lm version") + if not (hasattr(mix_prec_opt, "model_float16_groups") and hasattr(mix_prec_opt, "shard_fp32_from_float16_groups")): + raise Exception("megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, \ + if not, please check megatron-lm version") if not 
self.fp16_to_fp32_param and mix_prec_opt is not None: - for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, - mix_prec_opt.shard_fp32_from_float16_groups): + for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, mix_prec_opt.shard_fp32_from_float16_groups): for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group): self.fp16_to_fp32_param[fp16_param] = shard_fp32_param @@ -134,6 +132,4 @@ class OptimizerMonFactory: return MegatronFP32OptimizerMon() if opt_ty is None or opt_ty == "unknown": return DummyOptimizerMon() - raise Exception( - "opt_ty should be Megatron_Float16OptimizerWithFloat16Params or Megatron_DistributedOptimizer or " - "Megatron_FP32Optimizer or None or unknown") + raise Exception("opt_ty should be Megatron_Float16OptimizerWithFloat16Params or Megatron_DistributedOptimizer or None or unknown") -- Gitee From 95a56cc6abee6a2a8a4bffa254cfc0fb1dfdab0b Mon Sep 17 00:00:00 2001 From: qianggee Date: Thu, 15 Aug 2024 12:03:28 +0000 Subject: [PATCH 89/94] add reviewer --- debug/OWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/OWNERS b/debug/OWNERS index 09121722c9d..84a4493dd24 100644 --- a/debug/OWNERS +++ b/debug/OWNERS @@ -9,3 +9,4 @@ reviewers: - lv-kaimeng - litian_drinksnow - binghamhuang +- xiangsen2 -- Gitee From c9dadba2d021f7bb398d69a3134a474bdff196f0 Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 16 Aug 2024 01:18:38 +0000 Subject: [PATCH 90/94] clear unused code --- debug/accuracy_tools/kj600/kj600/anomaly_inform.py | 1 - debug/accuracy_tools/kj600/kj600/optimizer_collect.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_inform.py b/debug/accuracy_tools/kj600/kj600/anomaly_inform.py index 301ac769217..485c06d4d24 100644 --- a/debug/accuracy_tools/kj600/kj600/anomaly_inform.py +++ b/debug/accuracy_tools/kj600/kj600/anomaly_inform.py @@ -1,6 +1,5 @@ import smtplib from email.mime.text import MIMEText -import 
sqlite3 from datetime import datetime, timedelta from kj600.database import Database, ExceptionMessage diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py index 285f17ca6dc..6a06c8d7caf 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py @@ -1,10 +1,9 @@ from collections import defaultdict import torch import torch.distributed as dist -from kj600.visualizer import HeatmapVisualizer -def print_rank_0(message, debug=False, force=False): +def print_rank_0(message): if dist.is_initialized(): if dist.get_rank() == 0: print(message) -- Gitee From 33fb083bbc2735edf8c4ad26847e0e6fa80b83c1 Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 16 Aug 2024 01:33:13 +0000 Subject: [PATCH 91/94] safe file open --- debug/accuracy_tools/kj600/kj600/module_hook.py | 6 ++++-- debug/accuracy_tools/kj600/kj600/module_spec_verifier.py | 7 ------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index c41dcaed721..01aaad6bdfe 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -8,7 +8,7 @@ import torch import torch.distributed as dist from torch import Stream from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook -from kj600.module_spec_verifier import get_config, validate_config_spec +from kj600.module_spec_verifier import validate_config_spec from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer @@ -19,6 +19,7 @@ from kj600.module_metric import get_metrics, write_metrics_tensorboard, write_me from kj600.distributed.wrap_distributed import api_register, 
create_hooks, op_aggregate, get_process_group from kj600.utils import print_warn_log, print_info_log, get_param_struct from kj600.const import Const +from kj600.file_check import FileOpen try: @@ -114,7 +115,8 @@ class TrainerMon: self.grad_context = GradContext() self.process_group = get_process_group(process_group) self.params_have_main_grad = params_have_main_grad - self.config = get_config(config_file_path) + with FileOpen(config_file_path, 'r') as f: + self.config = json.load(f) self.module_rank_list = self.config.get("module_ranks", []) self.format = self.config.get('format', 'tensorboard') self.eps = self.config.get('eps', 1e-8) diff --git a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py b/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py index 395aa82f17a..66ea2805907 100644 --- a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py +++ b/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py @@ -2,15 +2,8 @@ import json import re import abc import torch -from kj600.utils import check_file_valid_readable -def get_config(file_path='config.json'): - check_file_valid_readable(file_path) - with open(file_path, 'r') as file: - config = json.load(file) - return config - # 用于存储所有validator实现类的注册表 config_validator_registry = {} -- Gitee From 941050a7d9e6d6b2702b6ddfb12f25c3f553fde9 Mon Sep 17 00:00:00 2001 From: heweidong7 <511650494@qq.com> Date: Fri, 16 Aug 2024 15:30:11 +0800 Subject: [PATCH 92/94] =?UTF-8?q?=E9=99=A4=E9=9B=B6=E4=BF=9D=E6=8A=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/kj600/kj600/optimizer_collect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py index beb28924915..08ada8ef001 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py @@ -68,7 +68,7 @@ 
class MixPrecisionOptimizerMon(OptimizerMon): exp_avg_dict[name] = exp_avg if monitor.ur_distribution: update_dict[name] = exp_avg / (torch.sqrt(exp_avg_sq) + torch_opt.defaults['eps']) - ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) + ratio_dict[name] = (exp_avg / torch.sqrt(exp_avg_sq)).nan_to_num(0) monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict) @@ -110,7 +110,7 @@ class MegatronFP32OptimizerMon(OptimizerMon): exp_avg_dict[name] = exp_avg if monitor.ur_distribution: update_dict[name] = exp_avg / (torch.sqrt(exp_avg_sq) + torch_opt.defaults['eps']) - ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) + ratio_dict[name] = (exp_avg / torch.sqrt(exp_avg_sq)).nan_to_num(0) monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict) -- Gitee From ff545364aa592af212fc74231df23b31f2b0b814 Mon Sep 17 00:00:00 2001 From: heweidong7 <511650494@qq.com> Date: Mon, 19 Aug 2024 10:00:47 +0800 Subject: [PATCH 93/94] =?UTF-8?q?=E5=BC=95=E7=94=A8bug=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/kj600/kj600/module_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index d24a4012547..472be4def09 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -9,7 +9,7 @@ import torch.distributed as dist from torch import Stream from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook from 
kj600.module_spec_verifier import get_config, validate_config_spec -from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon +from kj600.optimizer_collect import OptimizerMon, print_rank_0, OptimizerMonFactory from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer from kj600.anomaly_detect import AnomalyScanner, AnomalyDataFactory, SummaryWriterWithAD, CSVWriterWithAD -- Gitee From 937ba7d2d311e85a8e74e38348df99b7162592a0 Mon Sep 17 00:00:00 2001 From: heweidong7 <511650494@qq.com> Date: Mon, 19 Aug 2024 10:42:54 +0800 Subject: [PATCH 94/94] =?UTF-8?q?=E5=8F=98=E9=87=8F=E5=90=8D=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/kj600/kj600/module_hook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 472be4def09..c6446d874fd 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -372,8 +372,8 @@ class TrainerMon: mv_result = self.mix_precision_optimizer_mon.fetch_mv(self, optimizer, self.param2name) context.param_exp_avg = mv_result.exp_avg context.param_exp_avg_sq = mv_result.exp_avg_sq - context.param_adam_update = mv_result.adam_update - context.param_adam_ratio = mv_result.adam_ratio + context.param_adam_update = mv_result.update + context.param_adam_ratio = mv_result.ratio if self.wg_distribution: if self.weight_hooked: -- Gitee