From b79e5bb98c488006a3075ee4c4f079e99ddc2333 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Fri, 22 Nov 2024 11:33:20 +0800 Subject: [PATCH 01/15] test --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ea548a0bfca..623c205332c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🚨 重要通知 +# 🚨 重要通知: **1. Ascend Training Tools 更名为 MindStudio Training Tools (mstt)。** -- Gitee From e76ef748d55fd6d00339e2db25d67ab008fb249b Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Fri, 22 Nov 2024 12:12:13 +0800 Subject: [PATCH 02/15] add free-benchmark auto-fix --- .../pytorch/free_benchmark/common/enums.py | 6 +- .../pytorch/free_benchmark/common/utils.py | 7 +- .../msprobe/pytorch/free_benchmark/main.py | 3 + .../perturbed_layers/layer_factory.py | 5 +- .../perturbed_layers/npu/auto_fix.py | 230 ++++++++++++++++++ .../perturbed_layers/npu/scale.py | 132 ++++++++++ 6 files changed, 379 insertions(+), 4 deletions(-) create mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py index 181631c624a..cb1654683f9 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py @@ -8,6 +8,8 @@ class PerturbationMode: NO_CHANGE = "no_change" BIT_NOISE = "bit_noise" TO_CPU = "to_cpu" + AUTO = "auto_fix" + SCALE = "scale" class DeviceType: @@ -48,6 +50,8 @@ class PytorchFreeBenchmarkConst: PerturbationMode.NO_CHANGE, PerturbationMode.BIT_NOISE, PerturbationMode.TO_CPU, + PerturbationMode.AUTO, + PerturbationMode.SCALE ] DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU] @@ -57,7 +61,7 @@ class PytorchFreeBenchmarkConst: FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL] DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD] - FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU] + FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO, PerturbationMode.SCALE] DEFAULT_FUZZ_STAGE = Const.FORWARD DEFAULT_PREHEAT_STEP = 15 DEFAULT_MAX_SAMPLE = 20 diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py index 391c2ceaca0..4a46ac3a91f 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py @@ -16,7 +16,7 @@ import torch from msprobe.core.common.exceptions import FreeBenchmarkException from msprobe.pytorch.free_benchmark.common.enums import DeviceType - +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode class Tools: @@ -74,8 +74,11 @@ class Tools: return tensor_seq @staticmethod - def convert_fuzz_output_to_origin(origin, perturbed): + def convert_fuzz_output_to_origin(origin, perturbed, pert_mode): if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor): + if pert_mode == PerturbationMode.AUTO: + origin.data = perturbed.to(origin.device) + return origin #!guo origin.data = perturbed.to(origin.dtype).to(origin.device) return origin if isinstance(origin, dict) and isinstance(perturbed, dict): diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py index 66d7b7e1042..1d3d6fdfe37 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py @@ -88,6 +88,9 @@ class FreeBenchmarkCheck(ABC): layer.handle(data_params) handler_params = make_handler_params(name, self.config, self.current_iter) handler = FuzzHandlerFactory.create(handler_params) + if handler_params.pert_mode == PerturbationMode.AUTO: + perturbed_output = handler.handle(data_params, handler_params.pert_mode) + return perturbed_output, handler.get_unequal_rows() perturbed_output = handler.handle(data_params) return perturbed_output, handler.get_unequal_rows() diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py index 79256cd4063..dcbd6739fc8 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py @@ -25,7 +25,8 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.improve_precision impor ) from msprobe.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer from msprobe.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer - +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.auto_fix import AutoLayer +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.scale import ScaleLayer class LayerFactory: layers = { @@ -35,6 +36,8 @@ class LayerFactory: PerturbationMode.NO_CHANGE: NoChangeLayer, PerturbationMode.BIT_NOISE: BitNoiseLayer, PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer, + PerturbationMode.AUTO: AutoLayer, + PerturbationMode.SCALE: ScaleLayer, }, DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer}, } diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py new file mode 100644 index 00000000000..810bd33eb7f --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -0,0 +1,230 @@ +# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch_npu +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import CommonField +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + +class ScaleConst: + """ + Class for ScaleLayer's const + """ + SOFTMAX_NAME = "softmax" + LINEAR_NAME = "linear" + MATMUL_NAME = "matmul" + + FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix + FP16_UB = torch.finfo(torch.float16).max + + import numpy as np + SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu() + + COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] + +class AutoLayer(NpuBaseLayer): + scale_var = 1.0 + + def check_catastrophe(self, tensor_obj): + if isinstance(tensor_obj, torch.Tensor): + if torch.all(tensor_obj.eq(0)): + return True + if torch.isinf(tensor_obj).any(): + return True + if torch.isnan(tensor_obj).any(): + return True + return False + if isinstance(tensor_obj, dict): + return any(self.check_catastrophe(value) for value in tensor_obj.values()) + if isinstance(tensor_obj, (tuple, list)): + return any(self.check_catastrophe(value) for value in tensor_obj) + return False + + def tensor_scale(self, tensor_obj,unscale=False): + if isinstance(tensor_obj, torch.Tensor): + if(unscale): + tensor_obj = self._unscale(tensor_obj) + else: + tensor_obj = self._scale(tensor_obj) + self.is_added = True + return tensor_obj + if isinstance(tensor_obj, dict): + return { + key: self.tensor_scale(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.tensor_scale(value) for value in tensor_obj] + ) + return tensor_obj + + def tensor_contiguous(self, tensor_obj): + if isinstance(tensor_obj, torch.Tensor): + return tensor_obj.contiguous() + if isinstance(tensor_obj, dict): + return { + key: self.tensor_contiguous(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.tensor_contiguous(value) for value in tensor_obj] + ) + return tensor_obj + + def improve_tensor_precision(self, tensor_obj): + if ( + isinstance(tensor_obj, torch.Tensor) + and torch.is_floating_point(tensor_obj) + and tensor_obj.dtype not in [torch.float32, torch.float64] + ): + self._set_improve_values(tensor_obj) + tensor_obj = self._change_dtype(tensor_obj) + self.is_added = True + return tensor_obj + if isinstance(tensor_obj, dict): + return { + key: self.improve_tensor_precision(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.improve_tensor_precision(value) for value in tensor_obj] + ) + return tensor_obj + + def handle(self, params: DataParams): + # Try Scale + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Perturbation is " + f"{PerturbationMode.AUTO} of {self.api_name}." + f"Trying Scale for this" + ) + for x in ScaleConst.COMMUNICATION_NAMES: + if x in self.api_name: + params.perturbed_result=(params.original_result)/AutoLayer.scale_var + AutoLayer.scale_var = 1.0 + + if(self.check_catastrophe(params.args)): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!") + params.perturbed_result = params.original_result + + if(ScaleConst.SOFTMAX_NAME in self.api_name): + new_args = params.args + else: + new_args = self.tensor_scale(params.args) + + params.perturbed_result = params.origin_func(*new_args, **params.kwargs) + + if(ScaleConst.SOFTMAX_NAME in self.api_name): + params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True) + try: + new_args1 = params.perturbed_result,*new_args[1:] + params.perturbed_result = params.origin_func(*new_args1, + **params.kwargs) + except KeyError as e: + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") + + # Try Improve Precision + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Auto Scaler is Useless " + f"Trying to improve precision for" + f"{PerturbationMode.AUTO} of {self.api_name}." + ) + new_args = self.improve_tensor_precision(params.args) + if params.fuzz_stage == Const.BACKWARD: + new_kwargs = {} + else: + new_kwargs = self.improve_tensor_precision(params.kwargs) + # 如果输入中全为高精度、应跳过二次执行、减少多余显存引用 + if not self.is_added: + return params.perturbed_result + if "inplace" in new_kwargs: + new_kwargs["inplace"] = False + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + + # Try Synchronize + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Auto Scaler is Useless " + f"Trying Synchronize for" + f"{PerturbationMode.AUTO} of {self.api_name}." + ) + torch_npu.npu.synchronize() + params.perturbed_result = params.origin_func(*params.args, **params.kwargs) + torch_npu.npu.synchronize() + + # Try Contiguous + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Synchronize is Useless, too " + f"Trying 'Contiguous' for" + f"{PerturbationMode.AUTO} of {self.api_name}." + ) + new_args = self.tensor_contiguous(params.args) + new_kwargs = self.tensor_contiguous(params.kwargs) + if not self.is_added: + return params.perturbed_result + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + + # Hint to 'tocpu' + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Contiguous is Useless, too " + f"Please set pert_mode to 'tocpu' for further check." + ) + return params.original_result + + def _scale(self, inputs): + self.scale_factor = (ScaleConst.SQRT_UB + / torch.maximum(ScaleConst.SQRT_UB, + torch.norm(inputs,p=1,dim=-1).max())) + scaled_inputs = inputs * self.scale_factor + AutoLayer.scale_var *= self.scale_factor + if AutoLayer.scale_var < ScaleConst.FP16_EPS: + AutoLayer.scale_var = ScaleConst.FP16_EPS + return scaled_inputs + + def _unscale(self, output): + if(ScaleConst.SOFTMAX_NAME in self.api_name): + unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach() + AutoLayer.scale_var = 1.0 + return unscaled_outputs + def _set_improve_values(self, inputs): + if inputs.dtype in [torch.float16, torch.bfloat16]: + self.perturbed_value = torch.float32 + + def _change_dtype(self, inputs): + if hasattr(inputs, CommonField.DEVICE): + device = inputs.device + if device is CommonField.META: + new_inputs = inputs.to( + device=CommonField.META, dtype=self.perturbed_value + ) + else: + new_inputs = inputs.to(dtype=self.perturbed_value).to(device) + else: + new_inputs = inputs.to(dtype=self.perturbed_value) + return new_inputs \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py new file mode 100644 index 00000000000..beaaa6c34bf --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py @@ -0,0 +1,132 @@ +# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#!guo @Rodin +import torch +import torch_npu + +from typing import Any, Callable, Dict, List, Optional, Tuple + +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import CommonField +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + +class ScaleConst: + """ + Class for ScaleLayer's const + """ + SOFTMAX_NAME = "softmax" + LINEAR_NAME = "linear" + MATMUL_NAME = "matmul" + + FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix + FP16_UB = torch.finfo(torch.float16).max + + import numpy as np + SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu() + + COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] + + +class ScaleLayer(NpuBaseLayer): + var = 1.0 + + def is_nonvalid(self, tensor_obj): + if isinstance(tensor_obj, torch.Tensor): + if not torch.isfinite(tensor_obj).all(): + return True + return False + if isinstance(tensor_obj, dict): + return any(self.is_nonvalid(value) for value in tensor_obj.values()) + if isinstance(tensor_obj, (tuple, list)): + return any(self.is_nonvalid(value) for value in tensor_obj) + return False + + def tensor_scale(self, tensor_obj,unscale=False): + if isinstance(tensor_obj, torch.Tensor): + if(unscale): + tensor_obj = self._unscale(tensor_obj) + else: + tensor_obj = self._scale(tensor_obj) + self.is_added = True + return tensor_obj + if isinstance(tensor_obj, dict): + return { + key: self.tensor_scale(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.tensor_scale(value) for value in tensor_obj] + ) + return tensor_obj + + def handle(self, params: DataParams): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Perturbation is " + f"{PerturbationMode.SCALE} of {self.api_name}.") + + for x in ScaleConst.COMMUNICATION_NAMES: + if x in self.api_name: + params.perturbed_result=(params.original_result)/ScaleLayer.var + ScaleLayer.var = 1.0 + return params.perturbed_result + + if(self.is_nonvalid(params.args)): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!") + params.perturbed_result = params.original_result + return params.perturbed_result + + if(ScaleConst.SOFTMAX_NAME in self.api_name): + new_args = params.args + else: + new_args = self.tensor_scale(params.args) + + params.perturbed_result = params.origin_func(*new_args, **params.kwargs) + + if(ScaleConst.SOFTMAX_NAME in self.api_name): + params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True) + try: + new_args1 = params.perturbed_result,*new_args[1:] + params.perturbed_result = params.origin_func(*new_args1, + **params.kwargs) + except KeyError as e: + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") + + return params.perturbed_result + + + def _scale(self, inputs): + self.scale_factor = (ScaleConst.SQRT_UB + / torch.maximum(ScaleConst.SQRT_UB, + torch.norm(inputs,p=1,dim=-1).max())) + scaled_inputs = inputs * self.scale_factor + ScaleLayer.var *= self.scale_factor + if ScaleLayer.var < ScaleConst.FP16_EPS: + ScaleLayer.var = ScaleConst.FP16_EPS + return scaled_inputs + + def _unscale(self, output): + if(ScaleConst.SOFTMAX_NAME in self.api_name): + unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / ScaleLayer.var).detach() + ScaleLayer.var = 1.0 + return unscaled_outputs + \ No newline at end of file -- Gitee From 06c0ba77c42a0bff3d4ae8e7175e8449d2875a82 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Fri, 22 Nov 2024 18:01:43 +0800 Subject: [PATCH 03/15] add free-benchmark auto-fix; delete scale --- .../docs/15.free_benchmarking_PyTorch.md | 7 +- .../pytorch/free_benchmark/common/enums.py | 8 +- .../msprobe/pytorch/free_benchmark/main.py | 1 + .../perturbed_layers/layer_factory.py | 2 - .../perturbed_layers/npu/scale.py | 132 ------------------ 5 files changed, 8 insertions(+), 142 deletions(-) delete mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md index a2bc2112c16..af3b9c9fef2 100644 --- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md @@ -20,7 +20,7 @@ 2. **扰动因子**:基于torch.nn.Module的hook机制,在注册的hook函数中对算子输入进行特定类型扰动。 3. **误差分析**: * **check**: 在hook函数中二次执行算子得到扰动后的算子输出,计算扰动后输出与原始输出的相对误差,查看是否符合精度标准; - * **fix**: 需要做验证时,可以选择将特定扰动类型(升精度,to cpu)的输出替换原始输出,观察对模型Loss是否有影响。 + * **fix**: 需要做验证时,可以选择将特定扰动类型(升精度,to cpu)的输出替换原始输出,观察对模型Loss是否有影响;需要恢复算子时,可以选择自动恢复,工具将自动执行——检测前向中Nan/inf/全0问题,然后基于缩放->切高精度->Synchronize->Contiguous->引导tocpu的顺序进行排查替换。 4. **精度风险算子**:不达标精度标准的,最终会在输出件中展示 ![alt text](./img/free_benchmark_framework.png) @@ -96,12 +96,13 @@ D-->config.json配置 - + +
参数是否必选可配置项适用场景
pert_mode"improve_precision" (默认)(常用)(可做验证) 插桩算子可能在低精度下有精度问题,扰动因子会将输入的低精度向量升精度。
pert_mode"improve_precision" (默认)(常用)(可做验证) 插桩算子可能在低精度下有精度问题,扰动因子会将输入的低精度向量升精度。
"bit_noise"(常用)插桩算子可能在轻微扰动下暴露精度问题,扰动因子会将输入向量最后一个比特位翻转。
"add_noise"插桩算子可能在轻微扰动下暴露精度问题,扰动因子会为输入向量增加一个极小。
"change_value"插桩算子可能存在大数吃小数问题,扰动因子会交换输入向量的首尾。
"no_change"插桩算子可能存在数值稳定性精度问题,扰动因子会复制原始输。
"to_cpu"(可做验证) 插桩算子可能在同 CPU 精度表现不一致,扰动因子会将输入转至 CPU,需要配合 fuzz_device="cpu"使用。
"auto_fix"(专做修复) 已有怀疑算子,实现自动恢复,检测前向中Nan/inf/全0问题,按照缩放->切高精度->Synchronize->Contiguous->引导tocpu的顺序进行排查替换,快速恢复。
fuzz_device"npu" (默认)pert_mode 不需要to cpu操作。
"cpu"pert_mode 须配置为"to_cpu",目前仅支持"to cpu"扰动因子。
@@ -111,7 +112,7 @@ D-->config.json配置 - +
参数是否必选可配置项适用场景
handler_type"check"(默认)要做精度问题算子排查,输出扰动前后不符合精度标准的算子,支持所有扰动因子。
"fix"要做可疑算子验证,用扰动后输出替换原始输出,支持"improve_precision","to_cpu"两种扰动因子。
"fix"要做可疑算子验证,用扰动后输出替换原始输出,支持"improve_precision","to_cpu"两种扰动因子;要做快速修复,用扰动后输出替换原始输出,支持"auto_fix"。
### 3.3 在模型脚本中开启工具 diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py index cb1654683f9..930aeff21a1 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py @@ -8,8 +8,7 @@ class PerturbationMode: NO_CHANGE = "no_change" BIT_NOISE = "bit_noise" TO_CPU = "to_cpu" - AUTO = "auto_fix" - SCALE = "scale" + AUTO = "auto_fix"#!guo class DeviceType: @@ -50,8 +49,7 @@ class PytorchFreeBenchmarkConst: PerturbationMode.NO_CHANGE, PerturbationMode.BIT_NOISE, PerturbationMode.TO_CPU, - PerturbationMode.AUTO, - PerturbationMode.SCALE + PerturbationMode.AUTO,#!guo ] DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU] @@ -61,7 +59,7 @@ class PytorchFreeBenchmarkConst: FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL] DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD] - FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO, PerturbationMode.SCALE] + FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]#!guo DEFAULT_FUZZ_STAGE = Const.FORWARD DEFAULT_PREHEAT_STEP = 15 DEFAULT_MAX_SAMPLE = 20 diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py index 1d3d6fdfe37..439a4cdf5c8 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py @@ -88,6 +88,7 @@ class FreeBenchmarkCheck(ABC): layer.handle(data_params) handler_params = make_handler_params(name, self.config, self.current_iter) handler = FuzzHandlerFactory.create(handler_params) + #!guo if handler_params.pert_mode == PerturbationMode.AUTO: perturbed_output = handler.handle(data_params, handler_params.pert_mode) return perturbed_output, handler.get_unequal_rows() diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py index dcbd6739fc8..15005ecd394 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py @@ -26,7 +26,6 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.improve_precision impor from msprobe.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer from msprobe.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer from msprobe.pytorch.free_benchmark.perturbed_layers.npu.auto_fix import AutoLayer -from msprobe.pytorch.free_benchmark.perturbed_layers.npu.scale import ScaleLayer class LayerFactory: layers = { @@ -37,7 +36,6 @@ class LayerFactory: PerturbationMode.BIT_NOISE: BitNoiseLayer, PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer, PerturbationMode.AUTO: AutoLayer, - PerturbationMode.SCALE: ScaleLayer, }, DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer}, } diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py deleted file mode 100644 index beaaa6c34bf..00000000000 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py +++ /dev/null @@ -1,132 +0,0 @@ -# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#!guo @Rodin -import torch -import torch_npu - -from typing import Any, Callable, Dict, List, Optional, Tuple - -from msprobe.core.common.const import Const -from msprobe.pytorch.free_benchmark import logger -from msprobe.pytorch.free_benchmark.common.constant import CommonField -from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode -from msprobe.pytorch.free_benchmark.common.params import DataParams -from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( - NpuBaseLayer, -) - -class ScaleConst: - """ - Class for ScaleLayer's const - """ - SOFTMAX_NAME = "softmax" - LINEAR_NAME = "linear" - MATMUL_NAME = "matmul" - - FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix - FP16_UB = torch.finfo(torch.float16).max - - import numpy as np - SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu() - - COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] - - -class ScaleLayer(NpuBaseLayer): - var = 1.0 - - def is_nonvalid(self, tensor_obj): - if isinstance(tensor_obj, torch.Tensor): - if not torch.isfinite(tensor_obj).all(): - return True - return False - if isinstance(tensor_obj, dict): - return any(self.is_nonvalid(value) for value in tensor_obj.values()) - if isinstance(tensor_obj, (tuple, list)): - return any(self.is_nonvalid(value) for value in tensor_obj) - return False - - def tensor_scale(self, tensor_obj,unscale=False): - if isinstance(tensor_obj, torch.Tensor): - if(unscale): - tensor_obj = self._unscale(tensor_obj) - else: - tensor_obj = self._scale(tensor_obj) - self.is_added = True - return tensor_obj - if isinstance(tensor_obj, dict): - return { - key: self.tensor_scale(value) - for key, value in tensor_obj.items() - } - if isinstance(tensor_obj, (tuple, list)): - return type(tensor_obj)( - [self.tensor_scale(value) for value in tensor_obj] - ) - return tensor_obj - - def handle(self, params: DataParams): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Perturbation is " - f"{PerturbationMode.SCALE} of {self.api_name}.") - - for x in ScaleConst.COMMUNICATION_NAMES: - if x in self.api_name: - params.perturbed_result=(params.original_result)/ScaleLayer.var - ScaleLayer.var = 1.0 - return params.perturbed_result - - if(self.is_nonvalid(params.args)): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!") - params.perturbed_result = params.original_result - return params.perturbed_result - - if(ScaleConst.SOFTMAX_NAME in self.api_name): - new_args = params.args - else: - new_args = self.tensor_scale(params.args) - - params.perturbed_result = params.origin_func(*new_args, **params.kwargs) - - if(ScaleConst.SOFTMAX_NAME in self.api_name): - params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True) - try: - new_args1 = params.perturbed_result,*new_args[1:] - params.perturbed_result = params.origin_func(*new_args1, - **params.kwargs) - except KeyError as e: - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") - - return params.perturbed_result - - - def _scale(self, inputs): - self.scale_factor = (ScaleConst.SQRT_UB - / torch.maximum(ScaleConst.SQRT_UB, - torch.norm(inputs,p=1,dim=-1).max())) - scaled_inputs = inputs * self.scale_factor - ScaleLayer.var *= self.scale_factor - if ScaleLayer.var < ScaleConst.FP16_EPS: - ScaleLayer.var = ScaleConst.FP16_EPS - return scaled_inputs - - def _unscale(self, output): - if(ScaleConst.SOFTMAX_NAME in self.api_name): - unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / ScaleLayer.var).detach() - ScaleLayer.var = 1.0 - return unscaled_outputs - \ No newline at end of file -- Gitee From 66e6e9618b2a32665b5409144bfa5fb27fbafe6a Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Fri, 22 Nov 2024 18:07:38 +0800 Subject: [PATCH 04/15] add free-benchmark auto-fix; delete scale --- .../msprobe/docs/15.free_benchmarking_PyTorch.md | 3 ++- .../msprobe/pytorch/free_benchmark/common/enums.py | 6 +++--- .../msprobe/pytorch/free_benchmark/common/utils.py | 2 +- debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py | 1 - 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md index af3b9c9fef2..6a590f4b093 100644 --- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md @@ -1,7 +1,7 @@ # PyTorch 场景的无标杆比对 ## 1 简介 -* 本工具的目标是在不依赖标杆数据的情况下,检测模型训练中可能存在的精度问题API级别算子,并提供升精度和tocpu接口快速验证。 +* 本工具的目标是在不依赖标杆数据的情况下,检测模型训练中可能存在的精度问题API级别算子,并提供升精度和tocpu接口快速验证,以及针对算子的快速恢复。 * 工具基于**数值病态分析理论**:对算子的输入增加很小的扰动,从而放大输出值异常现象;检测算子原始输出和扰动后输出间误差是否符合精度标准。 * 该工具的**特点**有: @@ -10,6 +10,7 @@ * 推荐使用场景(针对**算子精度问题**): * **暂无标杆数据**,模型Loss异常,要做精度问题算子排查; * **验证可疑算子**,要做进一步确认,验证是否对模型Loss有影响; + * **可疑算子快速恢复**,使用scale、切精度、同步等方法快速排除和恢复算子问题; * 低精度模型效果不如高精度,要做精度问题算子排查。 * 该工具的约束 * 仅支持Pytorch2.x场景; diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py index 930aeff21a1..dac78f016a0 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py @@ -8,7 +8,7 @@ class PerturbationMode: NO_CHANGE = "no_change" BIT_NOISE = "bit_noise" TO_CPU = "to_cpu" - AUTO = "auto_fix"#!guo + AUTO = "auto_fix" class DeviceType: @@ -49,7 +49,7 @@ class PytorchFreeBenchmarkConst: PerturbationMode.NO_CHANGE, PerturbationMode.BIT_NOISE, PerturbationMode.TO_CPU, - PerturbationMode.AUTO,#!guo + PerturbationMode.AUTO, ] DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU] @@ -59,7 +59,7 @@ class PytorchFreeBenchmarkConst: FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL] DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD] - FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]#!guo + FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO] DEFAULT_FUZZ_STAGE = Const.FORWARD DEFAULT_PREHEAT_STEP = 15 DEFAULT_MAX_SAMPLE = 20 diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py index 4a46ac3a91f..51f3b143443 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py @@ -78,7 +78,7 @@ class Tools: if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor): if pert_mode == PerturbationMode.AUTO: origin.data = perturbed.to(origin.device) - return origin #!guo + return origin origin.data = perturbed.to(origin.dtype).to(origin.device) return origin if isinstance(origin, dict) and isinstance(perturbed, dict): diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py index 439a4cdf5c8..1d3d6fdfe37 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py @@ -88,7 +88,6 @@ class FreeBenchmarkCheck(ABC): layer.handle(data_params) handler_params = make_handler_params(name, self.config, self.current_iter) handler = FuzzHandlerFactory.create(handler_params) - #!guo if handler_params.pert_mode == PerturbationMode.AUTO: perturbed_output = handler.handle(data_params, handler_params.pert_mode) return perturbed_output, handler.get_unequal_rows() -- Gitee From f3cb1f0d5edae7c420132e7b8e6d16315c2760be Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Fri, 22 Nov 2024 18:12:20 +0800 Subject: [PATCH 05/15] add free-benchmark auto-fix; delete scale --- .../perturbed_layers/npu/auto_fix.py | 62 +++-- tmp.py | 258 ++++++++++++++++++ 2 files changed, 303 insertions(+), 17 deletions(-) create mode 100644 tmp.py diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index 810bd33eb7f..1807323caa6 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -113,11 +113,16 @@ class AutoLayer(NpuBaseLayer): return tensor_obj def handle(self, params: DataParams): - # Try Scale + if not self.check_catastrophe(params.original_result): + params.perturbed_result = params.original_result + return params.perturbed_result + + #! Try Scale logger.info_on_rank_0( + f"[msprobe] Free benchmark: An Problem shows here.\n" f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.AUTO} of {self.api_name}." - f"Trying Scale for this" + f"Trying Scale for this." ) for x in ScaleConst.COMMUNICATION_NAMES: if x in self.api_name: @@ -145,12 +150,19 @@ class AutoLayer(NpuBaseLayer): except KeyError as e: logger.info_on_rank_0( f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") + + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, " + f"Problem solved." + ) + return params.perturbed_result - # Try Improve Precision + #! Try improve precision if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( - f"[msprobe] Free benchmark: Auto Scaler is Useless " - f"Trying to improve precision for" + f"[msprobe] Free benchmark: 'Scaler' is Useless. " + f"Trying to improve precision for " f"{PerturbationMode.AUTO} of {self.api_name}." ) new_args = self.improve_tensor_precision(params.args) @@ -158,41 +170,57 @@ class AutoLayer(NpuBaseLayer): new_kwargs = {} else: new_kwargs = self.improve_tensor_precision(params.kwargs) - # 如果输入中全为高精度、应跳过二次执行、减少多余显存引用 - if not self.is_added: - return params.perturbed_result if "inplace" in new_kwargs: new_kwargs["inplace"] = False params.perturbed_result = params.origin_func(*new_args, **new_kwargs) - # Try Synchronize + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, " + f"Problem solved." + ) + return params.perturbed_result + + #! Try Synchronize if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( - f"[msprobe] Free benchmark: Auto Scaler is Useless " - f"Trying Synchronize for" + f"[msprobe] Free benchmark: 'Improve Precision' is Useless " + f"Trying Synchronize for " f"{PerturbationMode.AUTO} of {self.api_name}." ) torch_npu.npu.synchronize() params.perturbed_result = params.origin_func(*params.args, **params.kwargs) torch_npu.npu.synchronize() + + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, " + f"Problem solved." + ) + return params.perturbed_result - # Try Contiguous + #! Try Contiguous if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( - f"[msprobe] Free benchmark: Synchronize is Useless, too " + f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. " f"Trying 'Contiguous' for" f"{PerturbationMode.AUTO} of {self.api_name}." ) new_args = self.tensor_contiguous(params.args) new_kwargs = self.tensor_contiguous(params.kwargs) - if not self.is_added: - return params.perturbed_result params.perturbed_result = params.origin_func(*new_args, **new_kwargs) - # Hint to 'tocpu' + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, " + f"Problem solved." + ) + return params.perturbed_result + + #! Hint to 'tocpu' if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( - f"[msprobe] Free benchmark: Contiguous is Useless, too " + f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. " f"Please set pert_mode to 'tocpu' for further check." ) return params.original_result diff --git a/tmp.py b/tmp.py new file mode 100644 index 00000000000..1102f8cfeb6 --- /dev/null +++ b/tmp.py @@ -0,0 +1,258 @@ +# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch_npu +from msprobe.core.common.const import Const +from msprobe.pytorch.free_benchmark import logger +from msprobe.pytorch.free_benchmark.common.constant import CommonField +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode +from msprobe.pytorch.free_benchmark.common.params import DataParams +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + +class ScaleConst: + """ + Class for ScaleLayer's const + """ + SOFTMAX_NAME = "softmax" + LINEAR_NAME = "linear" + MATMUL_NAME = "matmul" + + FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix + FP16_UB = torch.finfo(torch.float16).max + + import numpy as np + SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu() + + COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] + +class AutoLayer(NpuBaseLayer): + scale_var = 1.0 + + def check_catastrophe(self, tensor_obj): + if isinstance(tensor_obj, torch.Tensor): + if torch.all(tensor_obj.eq(0)): + return True + if torch.isinf(tensor_obj).any(): + return True + if torch.isnan(tensor_obj).any(): + return True + return False + if isinstance(tensor_obj, dict): + return any(self.check_catastrophe(value) for value in tensor_obj.values()) + if isinstance(tensor_obj, (tuple, list)): + return any(self.check_catastrophe(value) for value in tensor_obj) + return False + + def tensor_scale(self, tensor_obj,unscale=False): + if isinstance(tensor_obj, torch.Tensor): + if(unscale): + tensor_obj = self._unscale(tensor_obj) + else: + tensor_obj = self._scale(tensor_obj) + self.is_added = True + return tensor_obj + if isinstance(tensor_obj, dict): + return { + key: self.tensor_scale(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.tensor_scale(value) for value in tensor_obj] + ) + return tensor_obj + + def tensor_contiguous(self, tensor_obj): + if isinstance(tensor_obj, torch.Tensor): + return tensor_obj.contiguous() + if isinstance(tensor_obj, dict): + return { + key: self.tensor_contiguous(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.tensor_contiguous(value) for value in tensor_obj] + ) + return tensor_obj + + def improve_tensor_precision(self, tensor_obj): + if ( + isinstance(tensor_obj, torch.Tensor) + and torch.is_floating_point(tensor_obj) + and tensor_obj.dtype not in [torch.float32, torch.float64] + ): + self._set_improve_values(tensor_obj) + tensor_obj = self._change_dtype(tensor_obj) + self.is_added = True + return tensor_obj + if isinstance(tensor_obj, dict): + return { + key: self.improve_tensor_precision(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.improve_tensor_precision(value) for value in tensor_obj] + ) + return tensor_obj + + def handle(self, params: DataParams): + if not self.check_catastrophe(params.original_result): + params.perturbed_result = params.original_result + return params.perturbed_result + + #! Try Scale + logger.info_on_rank_0( + f"[msprobe] Free benchmark: An Problem shows here.\n" + f"[msprobe] Free benchmark: Perturbation is " + f"{PerturbationMode.AUTO} of {self.api_name}." + f"Trying Scale for this." + ) + for x in ScaleConst.COMMUNICATION_NAMES: + if x in self.api_name: + params.perturbed_result=(params.original_result)/AutoLayer.scale_var + AutoLayer.scale_var = 1.0 + + if(self.check_catastrophe(params.args)): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!") + params.perturbed_result = params.original_result + + if(ScaleConst.SOFTMAX_NAME in self.api_name): + new_args = params.args + else: + new_args = self.tensor_scale(params.args) + + params.perturbed_result = params.origin_func(*new_args, **params.kwargs) + + if(ScaleConst.SOFTMAX_NAME in self.api_name): + params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True) + try: + new_args1 = params.perturbed_result,*new_args[1:] + params.perturbed_result = params.origin_func(*new_args1, + **params.kwargs) + except KeyError as e: + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") + + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, " + f"Problem solved." + ) + return params.perturbed_result + + #! Try improve precision + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: 'Scaler' is Useless. " + f"Trying to improve precision for " + f"{PerturbationMode.AUTO} of {self.api_name}." + ) + new_args = self.improve_tensor_precision(params.args) + if params.fuzz_stage == Const.BACKWARD: + new_kwargs = {} + else: + new_kwargs = self.improve_tensor_precision(params.kwargs) + if "inplace" in new_kwargs: + new_kwargs["inplace"] = False + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, " + f"Problem solved." + ) + return params.perturbed_result + + #! Try Synchronize + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: 'Improve Precision' is Useless " + f"Trying Synchronize for " + f"{PerturbationMode.AUTO} of {self.api_name}." + ) + torch_npu.npu.synchronize() + params.perturbed_result = params.origin_func(*params.args, **params.kwargs) + torch_npu.npu.synchronize() + + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, " + f"Problem solved." + ) + return params.perturbed_result + + #! Try Contiguous + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. " + f"Trying 'Contiguous' for" + f"{PerturbationMode.AUTO} of {self.api_name}." + ) + new_args = self.tensor_contiguous(params.args) + new_kwargs = self.tensor_contiguous(params.kwargs) + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + + if not self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, " + f"Problem solved." + ) + return params.perturbed_result + + #! Hint to 'tocpu' + if self.check_catastrophe(params.perturbed_result): + logger.info_on_rank_0( + f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. " + f"Please set pert_mode to 'tocpu' for further check." + ) + return params.original_result + + def _scale(self, inputs): + self.scale_factor = (ScaleConst.SQRT_UB + / torch.maximum(ScaleConst.SQRT_UB, + torch.norm(inputs,p=1,dim=-1).max())) + scaled_inputs = inputs * self.scale_factor + AutoLayer.scale_var *= self.scale_factor + if AutoLayer.scale_var < ScaleConst.FP16_EPS: + AutoLayer.scale_var = ScaleConst.FP16_EPS + return scaled_inputs + + def _unscale(self, output): + if(ScaleConst.SOFTMAX_NAME in self.api_name): + unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach() + AutoLayer.scale_var = 1.0 + return unscaled_outputs + def _set_improve_values(self, inputs): + if inputs.dtype in [torch.float16, torch.bfloat16]: + self.perturbed_value = torch.float32 + + def _change_dtype(self, inputs): + if hasattr(inputs, CommonField.DEVICE): + device = inputs.device + if device is CommonField.META: + new_inputs = inputs.to( + device=CommonField.META, dtype=self.perturbed_value + ) + else: + new_inputs = inputs.to(dtype=self.perturbed_value).to(device) + else: + new_inputs = inputs.to(dtype=self.perturbed_value) + return new_inputs \ No newline at end of file -- Gitee From 168c5abf2bbf6c7a280b7fcd40c3b66e81b521d5 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Mon, 25 Nov 2024 09:59:08 +0800 Subject: [PATCH 06/15] update doc02,15 --- debug/accuracy_tools/msprobe/docs/02.config_introduction.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md index f9bcf3476a8..bd07f611059 100644 --- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md +++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md @@ -106,8 +106,8 @@ PyTorch 与 MindSpore 动态图场景下,"level"须为"L1";MindSpore 静态 PyTorch 场景:指定某一类 API,对某一类的 API 进行无标杆比对。
配置示例:"list": ["relu"]。 MindSpore 场景:指定 API 名称,对列表中的 API 进行检测。
配置示例:"list": ["mindspore.mint.div", "mindspore.ops.bmm", "mindspore.Tensor.__add__"]。 fuzz_device标杆设备,str 类型。可选参数:
"npu":无标杆,通过添加扰动因子进行比对,默认值;
"cpu":以 CPU 为标杆,pert_mode 须配置为"to_cpu"(仅 PyTorch 场景支持)。
配置示例:"fuzz_device": "npu"。否 - pert_mode无标杆扰动因子,str 类型。可选参数:
"improve_precision":对输入做升精度,默认值;
"add_noise":对输入增加噪声;
"no_change":不加扰动直接二次执行;
"bit_noise":输入的末位比特翻转,MindSpore 场景不支持 BF16 类型的向量;
"change_value":输入的张量首尾值调换;
"to_cpu":在 CPU 等价执行(仅 PyTorch 场景支持)。
配置示例:"pert_mode": "improve_precision"。否 - handler_type处理类型,可选参数:
"check":进行无标杆比对检查,默认值;
"fix":将扰动后的 API 输出结果覆盖原始 API 输出结果,尝试将 Loss 曲线恢复正常,该模式下不支持预热功能与反向过程,且仅支持"improve_precision"、"to_cpu"( PyTorch 场景)两种扰动因子。
配置示例:"handler_type": "check"。否 + pert_mode无标杆扰动因子,str 类型。可选参数:
"improve_precision":对输入做升精度,默认值;
"add_noise":对输入增加噪声;
"no_change":不加扰动直接二次执行;
"bit_noise":输入的末位比特翻转,MindSpore 场景不支持 BF16 类型的向量;
"change_value":输入的张量首尾值调换;
"to_cpu":在 CPU 等价执行(仅 PyTorch 场景支持)。
"auto_fix":使用scale、切精度、同步等方法快速排除和恢复算子问题。
配置示例:"pert_mode": "improve_precision"。否 + handler_type处理类型,可选参数:
"check":进行无标杆比对检查,默认值;
"fix":将扰动后的 API 输出结果覆盖原始 API 输出结果,尝试将 Loss 曲线恢复正常,该模式下不支持预热功能与反向过程,且仅支持"improve_precision"、"to_cpu"( PyTorch 场景)、"auto_fix"( PyTorch 场景)三种扰动因子。
配置示例:"handler_type": "check"。否 fuzz_level无标杆数据 dump 级别,即选择比对结果文件应输出的表头属性,当前仅支持取值为:"L1"。输出结果详见 1.6.1 无标杆比对数据存盘格式。否 fuzz_stage比对过程,选择对 API 前向或反向进行无标杆比对,可选参数:
"forward":前向,默认值;
"backward":反向, 仅 PyTorch 场景支持。当 fuzz_stage 为 "backward" 时,handler_type 只能为 "check"。
配置示例:"fuzz_stage": "backward"。否 if_preheat预热功能(仅 PyTorch 场景支持),bool 类型。开启功能后工具可以根据每次迭代的输出调整精度算法的阈值,从而更准确地找出存在精度问题的 API。当"handler_type": "fix"时,不支持预热。可选参数:
true(开启)或 false(关闭),默认关闭。
配置示例:"if_preheat": "true"。否 -- Gitee From a7465de08b6120ebc80fa2094c2d96b527fe9579 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Mon, 25 Nov 2024 13:05:17 +0800 Subject: [PATCH 07/15] update auto_fix-tocpu --- .../msprobe/pytorch/free_benchmark/common/utils.py | 2 +- .../free_benchmark/perturbed_layers/npu/auto_fix.py | 10 +++++++++- .../free_benchmark/result_handlers/fix_handler.py | 6 +++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py index 51f3b143443..ac00bd13f6e 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py @@ -77,7 +77,7 @@ class Tools: def convert_fuzz_output_to_origin(origin, perturbed, pert_mode): if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor): if pert_mode == PerturbationMode.AUTO: - origin.data = perturbed.to(origin.device) + origin.data = perturbed return origin origin.data = perturbed.to(origin.dtype).to(origin.device) return origin diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index 1807323caa6..8f4ebda2141 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -23,6 +23,8 @@ from msprobe.pytorch.free_benchmark.common.params import DataParams from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( NpuBaseLayer, ) +from msprobe.pytorch.free_benchmark.common.utils import Tools +from msprobe.pytorch.free_benchmark.common.enums import DeviceType class ScaleConst: """ @@ -221,8 +223,14 @@ class AutoLayer(NpuBaseLayer): if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. " - f"Please set pert_mode to 'tocpu' for further check." + f"Trying 'To_cpu' for" + f"{PerturbationMode.AUTO} of {self.api_name}." ) + new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True) + new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True) + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + return params.perturbed_result + return params.original_result def _scale(self, inputs): diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py index d0b918402dd..b70b0b6bce6 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py @@ -20,17 +20,17 @@ from msprobe.pytorch.free_benchmark import logger from msprobe.pytorch.free_benchmark.common.params import DataParams from msprobe.pytorch.free_benchmark.common.utils import Tools from msprobe.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler - +from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode class FixHandler(FuzzHandler): def get_threshold(self, dtype): return self._get_default_threshold(dtype) - def handle(self, data_params: DataParams) -> Any: + def handle(self, data_params: DataParams, pert_mode: PerturbationMode = None) -> Any: try: return Tools.convert_fuzz_output_to_origin( - data_params.original_result, data_params.perturbed_result + data_params.original_result, data_params.perturbed_result, pert_mode ) except FreeBenchmarkException as e: logger.warning( -- Gitee From f12cbeee8adebf5e559e85fc36b8ace61d431a27 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Mon, 25 Nov 2024 20:40:55 +0800 Subject: [PATCH 08/15] Update scale --- .../perturbed_layers/npu/auto_fix.py | 117 +++++++++--------- 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index 8f4ebda2141..3a7b367627a 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -25,6 +25,7 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ) from msprobe.pytorch.free_benchmark.common.utils import Tools from msprobe.pytorch.free_benchmark.common.enums import DeviceType +from typing import Any, Callable, Dict, List, Optional, Tuple class ScaleConst: """ @@ -43,8 +44,6 @@ class ScaleConst: COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] class AutoLayer(NpuBaseLayer): - scale_var = 1.0 - def check_catastrophe(self, tensor_obj): if isinstance(tensor_obj, torch.Tensor): if torch.all(tensor_obj.eq(0)): @@ -115,56 +114,50 @@ class AutoLayer(NpuBaseLayer): return tensor_obj def handle(self, params: DataParams): - if not self.check_catastrophe(params.original_result): - params.perturbed_result = params.original_result + self.scale_factor = 1.0 + params.perturbed_result = params.original_result + if not self.check_catastrophe(params.perturbed_result): return params.perturbed_result - - #! Try Scale logger.info_on_rank_0( - f"[msprobe] Free benchmark: An Problem shows here.\n" - f"[msprobe] Free benchmark: Perturbation is " - f"{PerturbationMode.AUTO} of {self.api_name}." - f"Trying Scale for this." - ) - for x in ScaleConst.COMMUNICATION_NAMES: - if x in self.api_name: - params.perturbed_result=(params.original_result)/AutoLayer.scale_var - AutoLayer.scale_var = 1.0 - - if(self.check_catastrophe(params.args)): + f"[msprobe] Free benchmark: An Problem shows here. " + ) + #! Try Scale + if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME in self.api_name: logger.info_on_rank_0( - f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!") - params.perturbed_result = params.original_result - - if(ScaleConst.SOFTMAX_NAME in self.api_name): - new_args = params.args - else: + f"[msprobe] Free benchmark: Perturbation is " + f"{PerturbationMode.AUTO} of {self.api_name}. " + f"Trying Scale for this." + ) new_args = self.tensor_scale(params.args) - - params.perturbed_result = params.origin_func(*new_args, **params.kwargs) - - if(ScaleConst.SOFTMAX_NAME in self.api_name): - params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True) - try: - new_args1 = params.perturbed_result,*new_args[1:] - params.perturbed_result = params.origin_func(*new_args1, - **params.kwargs) - except KeyError as e: + params.perturbed_result = params.origin_func( + *new_args, **params.kwargs) + + if (ScaleConst.SOFTMAX_NAME in self.api_name): + params.perturbed_result = self.tensor_scale( + params.perturbed_result, unscale=True) + try: + new_args1 = params.perturbed_result, *new_args[1:] + params.perturbed_result = params.origin_func(*new_args1, + **params.kwargs) + except KeyError as e: + logger.info_on_rank_0( + f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") + else: + params.perturbed_result = self.tensor_scale( + params.perturbed_result, unscale=True) + + if not self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( - f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") - - if not self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, " - f"Problem solved." - ) - return params.perturbed_result + f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, " + f"Problem solved." + ) + return params.perturbed_result #! Try improve precision if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( f"[msprobe] Free benchmark: 'Scaler' is Useless. " - f"Trying to improve precision for " + f"Trying 'improve precision' for " f"{PerturbationMode.AUTO} of {self.api_name}." ) new_args = self.improve_tensor_precision(params.args) @@ -187,7 +180,7 @@ class AutoLayer(NpuBaseLayer): if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( f"[msprobe] Free benchmark: 'Improve Precision' is Useless " - f"Trying Synchronize for " + f"Trying 'Synchronize' for " f"{PerturbationMode.AUTO} of {self.api_name}." ) torch_npu.npu.synchronize() @@ -223,31 +216,33 @@ class AutoLayer(NpuBaseLayer): if self.check_catastrophe(params.perturbed_result): logger.info_on_rank_0( f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. " - f"Trying 'To_cpu' for" - f"{PerturbationMode.AUTO} of {self.api_name}." + f"Please set pert_mode to 'To_cpu' for further check." ) - new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True) - new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True) - params.perturbed_result = params.origin_func(*new_args, **new_kwargs) - return params.perturbed_result - return params.original_result + def _get_scale_factor(self, inputs): + upper = ScaleConst.SQRT_UB / 2 + lower = torch.maximum(ScaleConst.SQRT_UB, + torch.norm(inputs, p=1, dim=-1).max()) + return upper / lower + def _scale(self, inputs): - self.scale_factor = (ScaleConst.SQRT_UB - / torch.maximum(ScaleConst.SQRT_UB, - torch.norm(inputs,p=1,dim=-1).max())) - scaled_inputs = inputs * self.scale_factor - AutoLayer.scale_var *= self.scale_factor - if AutoLayer.scale_var < ScaleConst.FP16_EPS: - AutoLayer.scale_var = ScaleConst.FP16_EPS + cur_scale = self._get_scale_factor(inputs) + self.scale_factor = max(ScaleConst.FP16_EPS, + cur_scale * self.scale_factor) + scaled_inputs = inputs * cur_scale return scaled_inputs def _unscale(self, output): - if(ScaleConst.SOFTMAX_NAME in self.api_name): - unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach() - AutoLayer.scale_var = 1.0 - return unscaled_outputs + if (ScaleConst.SOFTMAX_NAME in self.api_name): + unscaled_outputs = ( + torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach() + else: + rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB, + torch.norm(output, p=1, dim=-1).max()) + unscaled_outputs = rescale_coeff * output + return unscaled_outputs + def _set_improve_values(self, inputs): if inputs.dtype in [torch.float16, torch.bfloat16]: self.perturbed_value = torch.float32 -- Gitee From faf46ff3d18d3920e521719662204c7ddcca8082 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Tue, 26 Nov 2024 11:53:44 +0800 Subject: [PATCH 09/15] bmm adding --- .../pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index 3a7b367627a..b5b4ff78e6e 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -34,6 +34,7 @@ class ScaleConst: SOFTMAX_NAME = "softmax" LINEAR_NAME = "linear" MATMUL_NAME = "matmul" + BMM_NAME = "bmm" FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix FP16_UB = torch.finfo(torch.float16).max @@ -122,7 +123,7 @@ class AutoLayer(NpuBaseLayer): f"[msprobe] Free benchmark: An Problem shows here. " ) #! Try Scale - if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME in self.api_name: + if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name: logger.info_on_rank_0( f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.AUTO} of {self.api_name}. " -- Gitee From edfa2d8e0de72014042b53f8a12fb4fae9915bfe Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Tue, 26 Nov 2024 12:11:22 +0800 Subject: [PATCH 10/15] bmm adding --- .../perturbed_layers/npu/auto_fix.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index b5b4ff78e6e..73ea09c2abc 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -26,7 +26,7 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import from msprobe.pytorch.free_benchmark.common.utils import Tools from msprobe.pytorch.free_benchmark.common.enums import DeviceType from typing import Any, Callable, Dict, List, Optional, Tuple - +import numpy as np class ScaleConst: """ Class for ScaleLayer's const @@ -39,8 +39,8 @@ class ScaleConst: FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix FP16_UB = torch.finfo(torch.float16).max - import numpy as np - SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu() + + SQRT_UB = np.sqrt(FP16_UB) COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] @@ -222,10 +222,12 @@ class AutoLayer(NpuBaseLayer): return params.original_result def _get_scale_factor(self, inputs): - upper = ScaleConst.SQRT_UB / 2 - lower = torch.maximum(ScaleConst.SQRT_UB, - torch.norm(inputs, p=1, dim=-1).max()) - return upper / lower + nominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu() + max_denominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu() + denominator = torch.maximum(max_denominator, torch.norm(inputs, p=1, dim=-1).max()) + # if L1 norm of inputs is inf scale to 1 / sqrt(FP16_UB) + min_scale_factor = torch.tensor([1 / ScaleConst.SQRT_UB], dtype=torch.float16).npu() + return torch.maximum(nominator / denominator, min_scale_factor) def _scale(self, inputs): cur_scale = self._get_scale_factor(inputs) @@ -236,13 +238,12 @@ class AutoLayer(NpuBaseLayer): def _unscale(self, output): if (ScaleConst.SOFTMAX_NAME in self.api_name): - unscaled_outputs = ( - torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach() + unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor) + else: - rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB, - torch.norm(output, p=1, dim=-1).max()) - unscaled_outputs = rescale_coeff * output - return unscaled_outputs + rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB) + unscaled_outputs = output / rescale_factor + unscaled_outputs = torch.nan_to_num(unscaled_outputs, posinf=np.sqrt(ScaleConst.FP16_UB)) def _set_improve_values(self, inputs): if inputs.dtype in [torch.float16, torch.bfloat16]: -- Gitee From 17579b085e5d743e607df9067a6e32bc99c441ef Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Tue, 26 Nov 2024 14:32:48 +0800 Subject: [PATCH 11/15] update 4 OP --- .../perturbed_layers/npu/auto_fix.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index 73ea09c2abc..a5c1cd6d163 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -40,7 +40,7 @@ class ScaleConst: FP16_UB = torch.finfo(torch.float16).max - SQRT_UB = np.sqrt(FP16_UB) + SQRT_UB = torch.tensor([np.sqrt(FP16_UB)], dtype=torch.float16).npu() COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] @@ -122,6 +122,7 @@ class AutoLayer(NpuBaseLayer): logger.info_on_rank_0( f"[msprobe] Free benchmark: An Problem shows here. " ) + #! Try Scale if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name: logger.info_on_rank_0( @@ -222,12 +223,10 @@ class AutoLayer(NpuBaseLayer): return params.original_result def _get_scale_factor(self, inputs): - nominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu() - max_denominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu() - denominator = torch.maximum(max_denominator, torch.norm(inputs, p=1, dim=-1).max()) - # if L1 norm of inputs is inf scale to 1 / sqrt(FP16_UB) - min_scale_factor = torch.tensor([1 / ScaleConst.SQRT_UB], dtype=torch.float16).npu() - return torch.maximum(nominator / denominator, min_scale_factor) + upper = ScaleConst.SQRT_UB / 2 + lower = torch.maximum(ScaleConst.SQRT_UB, + torch.norm(inputs, p=1, dim=-1).max()) + return upper / lower def _scale(self, inputs): cur_scale = self._get_scale_factor(inputs) @@ -235,15 +234,16 @@ class AutoLayer(NpuBaseLayer): cur_scale * self.scale_factor) scaled_inputs = inputs * cur_scale return scaled_inputs - + def _unscale(self, output): if (ScaleConst.SOFTMAX_NAME in self.api_name): - unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor) - + unscaled_outputs = ( + torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach() else: - rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB) - unscaled_outputs = output / rescale_factor - unscaled_outputs = torch.nan_to_num(unscaled_outputs, posinf=np.sqrt(ScaleConst.FP16_UB)) + rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB, + torch.norm(output, p=1, dim=-1).max()) + unscaled_outputs = rescale_coeff * output + return unscaled_outputs def _set_improve_values(self, inputs): if inputs.dtype in [torch.float16, torch.bfloat16]: -- Gitee From 61b6f056e98cd89ad6ff988300811047c2d0b336 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Tue, 26 Nov 2024 14:38:52 +0800 Subject: [PATCH 12/15] update readme --- debug/accuracy_tools/msprobe/docs/02.config_introduction.md | 2 +- .../accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md index bd07f611059..a5d9e27062f 100644 --- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md +++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md @@ -103,7 +103,7 @@ PyTorch 与 MindSpore 动态图场景下,"level"须为"L1";MindSpore 静态 参数解释是否必选 scope自定义检测 API 列表(仅 PyTorch 场景支持),list[str] 类型,默认值为空列表,当 list 也为空列表时,表示检测所有 API。需要在 [ ] 内配置具体 API 名(在 dump 的结果中查看)。与 list 参数不能同时配置。
配置示例:"scope": ["Torch.matmul.0.forward", "Tensor.pow.4.forward"]。否 list自定义检测 API 类型或 API 名称,list[str] 类型,默认值为空列表,表示检测所有 API(PyTorch 场景下还需 scope 也为空列表)。与 scope 参数不能同时配置。否 - PyTorch 场景:指定某一类 API,对某一类的 API 进行无标杆比对。
配置示例:"list": ["relu"]。 + PyTorch 场景:指定某一类 API,对某一类的 API 进行无标杆比对。
配置示例:"list": ["relu"]。针对任务auto_fix, 须知其中scale功能支持matmul,bmm,softmax,linear,其他算子将跳过scale使用其他修复方法。 MindSpore 场景:指定 API 名称,对列表中的 API 进行检测。
配置示例:"list": ["mindspore.mint.div", "mindspore.ops.bmm", "mindspore.Tensor.__add__"]。 fuzz_device标杆设备,str 类型。可选参数:
"npu":无标杆,通过添加扰动因子进行比对,默认值;
"cpu":以 CPU 为标杆,pert_mode 须配置为"to_cpu"(仅 PyTorch 场景支持)。
配置示例:"fuzz_device": "npu"。否 pert_mode无标杆扰动因子,str 类型。可选参数:
"improve_precision":对输入做升精度,默认值;
"add_noise":对输入增加噪声;
"no_change":不加扰动直接二次执行;
"bit_noise":输入的末位比特翻转,MindSpore 场景不支持 BF16 类型的向量;
"change_value":输入的张量首尾值调换;
"to_cpu":在 CPU 等价执行(仅 PyTorch 场景支持)。
"auto_fix":使用scale、切精度、同步等方法快速排除和恢复算子问题。
配置示例:"pert_mode": "improve_precision"。否 diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md index 6a590f4b093..bd345813400 100644 --- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md @@ -88,7 +88,7 @@ D-->config.json配置 - +
参数是否必选可配置项适用场景
scope自定义需要通过指定算子名来限制算子插桩范围 如:["Torch.matmul.0.forward", "Tensor.pow.4.forward"]。
list自定义需要通过指定算子类型来限制算子插桩范围 如:["relu"] 会匹配所有算子名中包含relu的算子。
list自定义需要通过指定算子类型来限制算子插桩范围 如:["relu"] 会匹配所有算子名中包含relu的算子。针对任务auto_fix, 须知其中scale功能支持matmul,bmm,softmax,linear,其他算子将跳过scale使用其他修复方法。
fuzz_stage"forward"(默认)需要进行算子前向计算的精度问题排查或验证可疑算子。
"backward"需要进行算子反向计算的精度问题排查,不支持仅反向验证,前向验证包括反向。
-- Gitee From 2f8c3111a160ee6076c5c69e5fd1c9afa732bf1c Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Thu, 28 Nov 2024 10:44:25 +0800 Subject: [PATCH 13/15] backward scale fix support --- .../pytorch/free_benchmark/common/utils.py | 2 +- .../perturbed_layers/npu/auto_fix.py | 25 +++++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py index ac00bd13f6e..3dbded07e94 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py @@ -77,7 +77,7 @@ class Tools: def convert_fuzz_output_to_origin(origin, perturbed, pert_mode): if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor): if pert_mode == PerturbationMode.AUTO: - origin.data = perturbed + origin = perturbed return origin origin.data = perturbed.to(origin.dtype).to(origin.device) return origin diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index a5c1cd6d163..a210e2aaf69 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -41,6 +41,7 @@ class ScaleConst: SQRT_UB = torch.tensor([np.sqrt(FP16_UB)], dtype=torch.float16).npu() + SQRT_UB_INV = torch.tensor([1 / np.sqrt(FP16_UB)], dtype=torch.float16).npu() COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] @@ -221,12 +222,17 @@ class AutoLayer(NpuBaseLayer): f"Please set pert_mode to 'To_cpu' for further check." ) return params.original_result - + def _get_scale_factor(self, inputs): - upper = ScaleConst.SQRT_UB / 2 - lower = torch.maximum(ScaleConst.SQRT_UB, - torch.norm(inputs, p=1, dim=-1).max()) - return upper / lower + nominator = ScaleConst.SQRT_UB + x_norm = torch.norm(inputs, p=1, dim=-1).max() + if(torch.isfinite(x_norm).all()): + denominator = torch.maximum(ScaleConst.SQRT_UB, x_norm) + else: + # if L1 norm of inputs is inf scale to 1 / sqrt(FP16_UB) + return ScaleConst.SQRT_UB_INV.to(torch.get_device(inputs)) + computed_scale = (nominator / denominator).to(torch.get_device(inputs)) + return computed_scale def _scale(self, inputs): cur_scale = self._get_scale_factor(inputs) @@ -237,12 +243,11 @@ class AutoLayer(NpuBaseLayer): def _unscale(self, output): if (ScaleConst.SOFTMAX_NAME in self.api_name): - unscaled_outputs = ( - torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach() + unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor) else: - rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB, - torch.norm(output, p=1, dim=-1).max()) - unscaled_outputs = rescale_coeff * output + rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB) + unscaled_outputs = output / rescale_factor + return unscaled_outputs def _set_improve_values(self, inputs): -- Gitee From 870867b0459c06d46e4e0f997a2715aa92e6df26 Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Tue, 24 Dec 2024 11:53:14 +0800 Subject: [PATCH 14/15] check-cla --- .../perturbed_layers/npu/auto_fix.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py index a210e2aaf69..4941af49719 100644 --- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py +++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py @@ -15,18 +15,19 @@ import torch import torch_npu +import numpy as np + +from typing import Any, Callable, Dict, List, Optional, Tuple + +from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import NpuBaseLayer +from msprobe.pytorch.free_benchmark.common.utils import Tools +from msprobe.pytorch.free_benchmark.common.enums import DeviceType from msprobe.core.common.const import Const from msprobe.pytorch.free_benchmark import logger from msprobe.pytorch.free_benchmark.common.constant import CommonField from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode from msprobe.pytorch.free_benchmark.common.params import DataParams -from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( - NpuBaseLayer, -) -from msprobe.pytorch.free_benchmark.common.utils import Tools -from msprobe.pytorch.free_benchmark.common.enums import DeviceType -from typing import Any, Callable, Dict, List, Optional, Tuple -import numpy as np + class ScaleConst: """ Class for ScaleLayer's const @@ -116,6 +117,13 @@ class AutoLayer(NpuBaseLayer): return tensor_obj def handle(self, params: DataParams): + is_scale_applicable = ( + ScaleConst.SOFTMAX_NAME in self.api_name or + ScaleConst.LINEAR_NAME in self.api_name or + ScaleConst.MATMUL_NAME in self.api_name or + ScaleConst.BMM_NAME in self.api_name + ) + self.scale_factor = 1.0 params.perturbed_result = params.original_result if not self.check_catastrophe(params.perturbed_result): @@ -125,7 +133,7 @@ class AutoLayer(NpuBaseLayer): ) #! Try Scale - if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name: + if is_scale_applicable: logger.info_on_rank_0( f"[msprobe] Free benchmark: Perturbation is " f"{PerturbationMode.AUTO} of {self.api_name}. " @@ -244,11 +252,11 @@ class AutoLayer(NpuBaseLayer): def _unscale(self, output): if (ScaleConst.SOFTMAX_NAME in self.api_name): unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor) + return unscaled_outputs else: rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB) unscaled_outputs = output / rescale_factor - - return unscaled_outputs + return unscaled_outputs def _set_improve_values(self, inputs): if inputs.dtype in [torch.float16, torch.bfloat16]: -- Gitee From 67949d7af3ade77b1c0d4bfff508b9cd549cab3c Mon Sep 17 00:00:00 2001 From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com> Date: Tue, 24 Dec 2024 14:28:07 +0800 Subject: [PATCH 15/15] 12/24 --- tmp.py | 258 --------------------------------------------------------- 1 file changed, 258 deletions(-) delete mode 100644 tmp.py diff --git a/tmp.py b/tmp.py deleted file mode 100644 index 1102f8cfeb6..00000000000 --- a/tmp.py +++ /dev/null @@ -1,258 +0,0 @@ -# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch_npu -from msprobe.core.common.const import Const -from msprobe.pytorch.free_benchmark import logger -from msprobe.pytorch.free_benchmark.common.constant import CommonField -from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode -from msprobe.pytorch.free_benchmark.common.params import DataParams -from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( - NpuBaseLayer, -) - -class ScaleConst: - """ - Class for ScaleLayer's const - """ - SOFTMAX_NAME = "softmax" - LINEAR_NAME = "linear" - MATMUL_NAME = "matmul" - - FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix - FP16_UB = torch.finfo(torch.float16).max - - import numpy as np - SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu() - - COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"] - -class AutoLayer(NpuBaseLayer): - scale_var = 1.0 - - def check_catastrophe(self, tensor_obj): - if isinstance(tensor_obj, torch.Tensor): - if torch.all(tensor_obj.eq(0)): - return True - if torch.isinf(tensor_obj).any(): - return True - if torch.isnan(tensor_obj).any(): - return True - return False - if isinstance(tensor_obj, dict): - return any(self.check_catastrophe(value) for value in tensor_obj.values()) - if isinstance(tensor_obj, (tuple, list)): - return any(self.check_catastrophe(value) for value in tensor_obj) - return False - - def tensor_scale(self, tensor_obj,unscale=False): - if isinstance(tensor_obj, torch.Tensor): - if(unscale): - tensor_obj = self._unscale(tensor_obj) - else: - tensor_obj = self._scale(tensor_obj) - self.is_added = True - return tensor_obj - if isinstance(tensor_obj, dict): - return { - key: self.tensor_scale(value) - for key, value in tensor_obj.items() - } - if isinstance(tensor_obj, (tuple, list)): - return type(tensor_obj)( - [self.tensor_scale(value) for value in tensor_obj] - ) - return tensor_obj - - def tensor_contiguous(self, tensor_obj): - if isinstance(tensor_obj, torch.Tensor): - return tensor_obj.contiguous() - if isinstance(tensor_obj, dict): - return { - key: self.tensor_contiguous(value) - for key, value in tensor_obj.items() - } - if isinstance(tensor_obj, (tuple, list)): - return type(tensor_obj)( - [self.tensor_contiguous(value) for value in tensor_obj] - ) - return tensor_obj - - def improve_tensor_precision(self, tensor_obj): - if ( - isinstance(tensor_obj, torch.Tensor) - and torch.is_floating_point(tensor_obj) - and tensor_obj.dtype not in [torch.float32, torch.float64] - ): - self._set_improve_values(tensor_obj) - tensor_obj = self._change_dtype(tensor_obj) - self.is_added = True - return tensor_obj - if isinstance(tensor_obj, dict): - return { - key: self.improve_tensor_precision(value) - for key, value in tensor_obj.items() - } - if isinstance(tensor_obj, (tuple, list)): - return type(tensor_obj)( - [self.improve_tensor_precision(value) for value in tensor_obj] - ) - return tensor_obj - - def handle(self, params: DataParams): - if not self.check_catastrophe(params.original_result): - params.perturbed_result = params.original_result - return params.perturbed_result - - #! Try Scale - logger.info_on_rank_0( - f"[msprobe] Free benchmark: An Problem shows here.\n" - f"[msprobe] Free benchmark: Perturbation is " - f"{PerturbationMode.AUTO} of {self.api_name}." - f"Trying Scale for this." - ) - for x in ScaleConst.COMMUNICATION_NAMES: - if x in self.api_name: - params.perturbed_result=(params.original_result)/AutoLayer.scale_var - AutoLayer.scale_var = 1.0 - - if(self.check_catastrophe(params.args)): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!") - params.perturbed_result = params.original_result - - if(ScaleConst.SOFTMAX_NAME in self.api_name): - new_args = params.args - else: - new_args = self.tensor_scale(params.args) - - params.perturbed_result = params.origin_func(*new_args, **params.kwargs) - - if(ScaleConst.SOFTMAX_NAME in self.api_name): - params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True) - try: - new_args1 = params.perturbed_result,*new_args[1:] - params.perturbed_result = params.origin_func(*new_args1, - **params.kwargs) - except KeyError as e: - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!") - - if not self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, " - f"Problem solved." - ) - return params.perturbed_result - - #! Try improve precision - if self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: 'Scaler' is Useless. " - f"Trying to improve precision for " - f"{PerturbationMode.AUTO} of {self.api_name}." - ) - new_args = self.improve_tensor_precision(params.args) - if params.fuzz_stage == Const.BACKWARD: - new_kwargs = {} - else: - new_kwargs = self.improve_tensor_precision(params.kwargs) - if "inplace" in new_kwargs: - new_kwargs["inplace"] = False - params.perturbed_result = params.origin_func(*new_args, **new_kwargs) - - if not self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, " - f"Problem solved." - ) - return params.perturbed_result - - #! Try Synchronize - if self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: 'Improve Precision' is Useless " - f"Trying Synchronize for " - f"{PerturbationMode.AUTO} of {self.api_name}." - ) - torch_npu.npu.synchronize() - params.perturbed_result = params.origin_func(*params.args, **params.kwargs) - torch_npu.npu.synchronize() - - if not self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, " - f"Problem solved." - ) - return params.perturbed_result - - #! Try Contiguous - if self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. " - f"Trying 'Contiguous' for" - f"{PerturbationMode.AUTO} of {self.api_name}." - ) - new_args = self.tensor_contiguous(params.args) - new_kwargs = self.tensor_contiguous(params.kwargs) - params.perturbed_result = params.origin_func(*new_args, **new_kwargs) - - if not self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, " - f"Problem solved." - ) - return params.perturbed_result - - #! Hint to 'tocpu' - if self.check_catastrophe(params.perturbed_result): - logger.info_on_rank_0( - f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. " - f"Please set pert_mode to 'tocpu' for further check." - ) - return params.original_result - - def _scale(self, inputs): - self.scale_factor = (ScaleConst.SQRT_UB - / torch.maximum(ScaleConst.SQRT_UB, - torch.norm(inputs,p=1,dim=-1).max())) - scaled_inputs = inputs * self.scale_factor - AutoLayer.scale_var *= self.scale_factor - if AutoLayer.scale_var < ScaleConst.FP16_EPS: - AutoLayer.scale_var = ScaleConst.FP16_EPS - return scaled_inputs - - def _unscale(self, output): - if(ScaleConst.SOFTMAX_NAME in self.api_name): - unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach() - AutoLayer.scale_var = 1.0 - return unscaled_outputs - def _set_improve_values(self, inputs): - if inputs.dtype in [torch.float16, torch.bfloat16]: - self.perturbed_value = torch.float32 - - def _change_dtype(self, inputs): - if hasattr(inputs, CommonField.DEVICE): - device = inputs.device - if device is CommonField.META: - new_inputs = inputs.to( - device=CommonField.META, dtype=self.perturbed_value - ) - else: - new_inputs = inputs.to(dtype=self.perturbed_value).to(device) - else: - new_inputs = inputs.to(dtype=self.perturbed_value) - return new_inputs \ No newline at end of file -- Gitee