Move _analyze_builtin from BaseDataProcessor into the MindSpore and PyTorch
processors so tensor-valued slice bounds are converted to Python scalars, and
compute Max/Min_except_inf_nan in PytorchDataProcessor instead of only in
OverflowCheckDataProcessor.

Review changes relative to the submitted patch: the new inf/NaN checks skip
statistics that are None, the added helpers carry docstrings, and the inline
comments are in English. Hunk offsets are updated accordingly; the index lines
still carry the blob ids of the submitted patch.

diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py
index 5d9012919735998f7e66c447cb0d4134c0c409d4..d316ac8f781376736c88091e2db008434acf227a 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py
@@ -108,17 +108,6 @@ class BaseDataProcessor:
     def _analyze_numpy(value, numpy_type):
         return {"type": numpy_type, "value": value}
 
-    @staticmethod
-    def _analyze_builtin(arg):
-        single_arg = {}
-        if isinstance(arg, slice):
-            single_arg.update({"type": "slice"})
-            single_arg.update({"value": [arg.start, arg.stop, arg.step]})
-        else:
-            single_arg.update({"type": type(arg).__name__})
-            single_arg.update({"value": arg})
-        return single_arg
-
     @classmethod
     def get_special_types(cls):
         return cls.special_type
@@ -183,7 +172,7 @@ class BaseDataProcessor:
 
     def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         api_info_struct = {}
-        if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT): # check whether data_mode contains forward or input
+        if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT):
             api_info_struct[name] = {}
             self.api_data_category = Const.INPUT
             args_info_list = self.analyze_element(module_input_output.args_tuple)
@@ -192,7 +181,7 @@ class BaseDataProcessor:
             kwargs_info_list = self.analyze_element(module_input_output.kwargs)
             api_info_struct[name][Const.INPUT_KWARGS] = kwargs_info_list
 
-        if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT): # check whether data_mode contains forward or output
+        if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT):
             api_info_struct[name] = api_info_struct.get(name, {})
             self.api_data_category = Const.OUTPUT
             output_info_list = self.analyze_element(module_input_output.output_tuple)
diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
index 7533e2ee0de79d9d78a31e3e8813e4dfb0edb918..44a382ad31407b1be8ecedfd49a5d5305554a10e 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
@@ -48,6 +48,23 @@ class MindsporeDataProcessor(BaseDataProcessor):
     def analyze_dtype_in_kwargs(element):
         return {"type": "mindspore.dtype", "value": str(element)}
 
+    @staticmethod
+    def _analyze_builtin(arg):
+        """Describe a builtin value as a {"type", "value"} dict; a slice becomes [start, stop, step]."""
+        single_arg = {}
+        if isinstance(arg, slice):
+            single_arg.update({"type": "slice"})
+            # Slice bounds may be tensors; convert them to Python scalars so the value can be JSON-serialized.
+            values = [
+                value if not isinstance(value, ms.Tensor) else value.item()
+                for value in [arg.start, arg.stop, arg.step]
+            ]
+            single_arg.update({"value": values})
+        else:
+            single_arg.update({"type": type(arg).__name__})
+            single_arg.update({"value": arg})
+        return single_arg
+
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.mindspore_special_type
diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
index 1c599573c070f3f2580419fb156ec066883887aa..3e22a2b64ca65ee534e86bc14741a5c8672b7cef 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
@@ -78,9 +78,47 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.norm = torch._C._VariableFunctionsClass.norm(data_clone).item()
         return tensor_stat
 
+    @staticmethod
+    def _analyze_builtin(arg):
+        """Describe a builtin value as a {"type", "value"} dict; a slice becomes [start, stop, step]."""
+        single_arg = {}
+        if isinstance(arg, slice):
+            single_arg.update({"type": "slice"})
+            # Slice bounds may be tensors; convert them to Python scalars so the value can be JSON-serialized.
+            values = [
+                value if not isinstance(value, torch.Tensor) else value.item()
+                for value in [arg.start, arg.stop, arg.step]
+            ]
+            single_arg.update({"value": values})
+        else:
+            single_arg.update({"type": type(arg).__name__})
+            single_arg.update({"value": arg})
+        return single_arg
+
     @staticmethod
     def _analyze_torch_size(arg):
         return {"type": "torch.Size", "value": list(arg)}
+
+    @staticmethod
+    def handle_tensor_extremum_nan_inf(tensor, operator):
+        """Return the max/min of tensor after dropping NaN and, if possible, inf.
+
+        operator 'max' selects the maximum; any other value selects the minimum.
+        An all-NaN (or empty) tensor yields NaN; with no finite element only NaN is dropped.
+        """
+        data_clone = tensor.detach()
+        data_nan = torch._C._VariableFunctionsClass.isnan(data_clone)
+        if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel():
+            return float('nan')
+        finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone)
+        if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0:
+            finite_values = data_clone[finite_mask]
+            return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \
+                torch._C._VariableFunctionsClass.min(finite_values).item()
+        else:
+            data_no_nan = data_clone[~data_nan]
+            return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \
+                torch._C._VariableFunctionsClass.min(data_no_nan).item()
 
     @classmethod
     def get_special_types(cls):
@@ -114,6 +152,13 @@ class PytorchDataProcessor(BaseDataProcessor):
         tensor_json.update({"Mean": tensor_stat.mean})
         tensor_json.update({"Norm": tensor_stat.norm})
         tensor_json.update({"requires_grad": tensor.requires_grad})
+
+        # A statistic can be None (the overflow check guards for it); np.isinf(None) raises TypeError.
+        if tensor_stat.max is not None and (np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max)):
+            tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max")
+        if tensor_stat.min is not None and (np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min)):
+            tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min")
+
         if self.config.summary_mode == "md5":
             tensor_md5 = self.get_md5_for_tensor(tensor)
             tensor_json.update({"md5": tensor_md5})
@@ -152,21 +197,6 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE)
         return overflow_mode == Const.ENV_ENABLE
 
-    @staticmethod
-    def handle_tensor_extremum_nan_inf(data_clone, operator):
-        data_nan = torch._C._VariableFunctionsClass.isnan(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel():
-            return float('nan')
-        finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0:
-            finite_values = data_clone[finite_mask]
-            return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(finite_values).item()
-        else:
-            data_no_nan = data_clone[~data_nan]
-            return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(data_no_nan).item()
-
     def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self.has_overflow = False
         api_info_struct = super().analyze_forward(name, module, module_input_output)
@@ -212,16 +242,13 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         else:
             torch_npu._C._clear_overflow_npu()
 
-    def _analyze_maybe_overflow_tensor(self, tensor_json, tensor):
-        data_clone = tensor.detach()
+    def _analyze_maybe_overflow_tensor(self, tensor_json):
         if is_gpu or (hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan()):
             if tensor_json['Max'] is None:
                 return
             if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']):
-                tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "max")
                 self.has_overflow = True
             if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']):
-                tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "min")
                 self.has_overflow = True
         else:
             self.has_overflow = self.check_overflow_npu()