From 489a5dddb871a0e329879c8776404357d7eeab95 Mon Sep 17 00:00:00 2001 From: makai Date: Sat, 3 Aug 2024 14:24:44 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E3=80=90Bugfix=E3=80=91=E5=BD=93=E8=B6=85?= =?UTF-8?q?=E8=BF=87=E6=BA=A2=E5=87=BA=E6=AC=A1=E6=95=B0=E5=90=8E=EF=BC=8C?= =?UTF-8?q?=E5=85=88=E4=BF=9D=E5=AD=98=E6=95=B0=E6=8D=AE=EF=BC=8Cmsprobe?= =?UTF-8?q?=E5=86=8D=E9=80=80=E5=87=BA=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/data_dump/data_collector.py | 2 +- .../data_dump/data_processor/pytorch_processor.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index db437539af..9e7edae551 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -112,7 +112,7 @@ class DataCollector: self.data_writer.update_construct(self.module_processor.module_node) def handle_data(self, name, data_info, use_buffer=True): - msg = f"msProbe is collecting data on {name}. " + msg = f"msprobe is collecting data on {name}. " if data_info: msg = self.update_data(data_info, msg) logger.info(msg) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 007fec8096..d63664f31b 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -209,16 +209,17 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): for file_path, tensor in self.cached_tensors_and_file_paths.items(): torch.save(tensor, file_path) change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) - self.inc_and_check_overflow_times() + self.real_overflow_dump_times += 1 self.cached_tensors_and_file_paths = {} - def inc_and_check_overflow_times(self): - self.real_overflow_dump_times += 1 + def stop_run(self): if self.overflow_nums == -1: - return + return False if self.real_overflow_dump_times >= self.overflow_nums: - raise MsprobeException(MsprobeException.OVERFLOW_NUMS_ERROR, str(self.real_overflow_dump_times)) - + logger.warning(f"[msprobe] 超过预设溢出次数 当前溢出次数:{self.real_overflow_dump_times}") + return True + return False + def check_overflow_npu(self): if self.overflow_debug_mode_enalbe(): float_status = torch.zeros(self.bits_for_overflow).npu() -- Gitee From cf029aaba8806c43fcf89bbccdfbf0b8327f4973 Mon Sep 17 00:00:00 2001 From: makai Date: Mon, 5 Aug 2024 16:34:38 +0800 Subject: [PATCH 2/5] rename stop_run --- .../accuracy_tools/msprobe/core/data_dump/data_collector.py | 4 ++-- .../msprobe/core/data_dump/data_processor/base.py | 5 +++-- .../core/data_dump/data_processor/mindspore_processor.py | 3 ++- .../core/data_dump/data_processor/pytorch_processor.py | 3 ++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index 9e7edae551..be7a79c7ff 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -90,7 +90,7 @@ class DataCollector: if self.config.level == "L2": return self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name)) - if self.data_processor.stop_run(): + if self.data_processor.is_should_stop_run: self.handle_data(name, data_info, use_buffer=False) raise Exception("[msprobe] exit") self.handle_data(name, data_info) @@ -101,7 +101,7 @@ class DataCollector: return data_info = self.data_processor.analyze_backward(name, module, module_input_output) - if self.data_processor.stop_run(): + if self.data_processor.is_should_stop_run: self.handle_data(name, data_info, use_buffer=False) raise Exception("[msprobe] exit") self.handle_data(name, data_info) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index cb7d31c606..68b8fdb2f6 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -235,5 +235,6 @@ class BaseDataProcessor: file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name) return dump_data_name, file_path - def stop_run(self): - return False + # @property + # def is_should_stop_run(self): + # return False diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index c208df7d90..a6fe0044d5 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -178,7 +178,8 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor): self.real_overflow_dump_times += 1 self.cached_tensors_and_file_paths = {} - def stop_run(self): + @property + def is_should_stop_run(self): if self.overflow_nums == -1: return False if self.real_overflow_dump_times >= self.overflow_nums: diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index d63664f31b..0043a48188 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -212,7 +212,8 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): self.real_overflow_dump_times += 1 self.cached_tensors_and_file_paths = {} - def stop_run(self): + @property + def is_should_stop_run(self): if self.overflow_nums == -1: return False if self.real_overflow_dump_times >= self.overflow_nums: -- Gitee From d55dc9fded99164309a53bfb6b4a8de708d78837 Mon Sep 17 00:00:00 2001 From: makai Date: Mon, 5 Aug 2024 18:45:12 +0800 Subject: [PATCH 3/5] undo annotation --- .../msprobe/core/data_dump/data_processor/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index 68b8fdb2f6..fc4033fe22 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -235,6 +235,6 @@ class BaseDataProcessor: file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name) return dump_data_name, file_path - # @property - # def is_should_stop_run(self): - # return False + @property + def is_should_stop_run(self): + return False -- Gitee From 7f0ddd46afdbaf6835de45a28442d344077e550e Mon Sep 17 00:00:00 2001 From: makai Date: Mon, 5 Aug 2024 19:28:14 +0800 Subject: [PATCH 4/5] rename as is --- debug/accuracy_tools/msprobe/core/data_dump/data_collector.py | 4 ++-- .../msprobe/core/data_dump/data_processor/base.py | 2 +- .../core/data_dump/data_processor/mindspore_processor.py | 4 ++-- .../core/data_dump/data_processor/pytorch_processor.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index be7a79c7ff..35cd02cc9c 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -90,7 +90,7 @@ class DataCollector: if self.config.level == "L2": return self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name)) - if self.data_processor.is_should_stop_run: + if self.data_processor.is_termination: self.handle_data(name, data_info, use_buffer=False) raise Exception("[msprobe] exit") self.handle_data(name, data_info) @@ -101,7 +101,7 @@ class DataCollector: return data_info = self.data_processor.analyze_backward(name, module, module_input_output) - if self.data_processor.is_should_stop_run: + if self.data_processor.is_termination: self.handle_data(name, data_info, use_buffer=False) raise Exception("[msprobe] exit") self.handle_data(name, data_info) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index fc4033fe22..f8e6625564 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -236,5 +236,5 @@ class BaseDataProcessor: return dump_data_name, file_path @property - def is_should_stop_run(self): + def is_termination(self): return False diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index a6fe0044d5..38779b0609 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -179,11 +179,11 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor): self.cached_tensors_and_file_paths = {} @property - def is_should_stop_run(self): + def is_termination(self): if self.overflow_nums == -1: return False if self.real_overflow_dump_times >= self.overflow_nums: - logger.warning(f"[msprobe] 超过预设溢出次数 当前溢出次数: {self.real_overflow_dump_times}") + logger.info(f"[msprobe] 超过预设溢出次数 当前溢出次数: {self.real_overflow_dump_times}") return True return False diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 0043a48188..4a14560061 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -213,11 +213,11 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): self.cached_tensors_and_file_paths = {} @property - def is_should_stop_run(self): + def is_termination(self): if self.overflow_nums == -1: return False if self.real_overflow_dump_times >= self.overflow_nums: - logger.warning(f"[msprobe] 超过预设溢出次数 当前溢出次数:{self.real_overflow_dump_times}") + logger.info(f"[msprobe] 超过预设溢出次数 当前溢出次数:{self.real_overflow_dump_times}") return True return False -- Gitee From e5bdb98239285c53503014adb4d2c6c44d330bd7 Mon Sep 17 00:00:00 2001 From: makai Date: Mon, 5 Aug 2024 20:16:06 +0800 Subject: [PATCH 5/5] rename as is_terminated --- debug/accuracy_tools/msprobe/core/data_dump/data_collector.py | 4 ++-- .../msprobe/core/data_dump/data_processor/base.py | 2 +- .../core/data_dump/data_processor/mindspore_processor.py | 2 +- .../core/data_dump/data_processor/pytorch_processor.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index 35cd02cc9c..aa93a12996 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -90,7 +90,7 @@ class DataCollector: if self.config.level == "L2": return self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name)) - if self.data_processor.is_termination: + if self.data_processor.is_terminated: self.handle_data(name, data_info, use_buffer=False) raise Exception("[msprobe] exit") self.handle_data(name, data_info) @@ -101,7 +101,7 @@ class DataCollector: return data_info = self.data_processor.analyze_backward(name, module, module_input_output) - if self.data_processor.is_termination: + if self.data_processor.is_terminated: self.handle_data(name, data_info, use_buffer=False) raise Exception("[msprobe] exit") self.handle_data(name, data_info) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index f8e6625564..ecca712082 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -236,5 +236,5 @@ class BaseDataProcessor: return dump_data_name, file_path @property - def is_termination(self): + def is_terminated(self): return False diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index 38779b0609..457abb6976 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -179,7 +179,7 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor): self.cached_tensors_and_file_paths = {} @property - def is_termination(self): + def is_terminated(self): if self.overflow_nums == -1: return False if self.real_overflow_dump_times >= self.overflow_nums: diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 4a14560061..f528e39de7 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -213,7 +213,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor): self.cached_tensors_and_file_paths = {} @property - def is_termination(self): + def is_terminated(self): if self.overflow_nums == -1: return False if self.real_overflow_dump_times >= self.overflow_nums: -- Gitee