From b79e5bb98c488006a3075ee4c4f079e99ddc2333 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 11:33:20 +0800
Subject: [PATCH 01/15] test
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index ea548a0bfca..623c205332c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# 🚨 重要通知
+# 🚨 重要通知:
**1. Ascend Training Tools 更名为 MindStudio Training Tools (mstt)。**
--
Gitee
From e76ef748d55fd6d00339e2db25d67ab008fb249b Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 12:12:13 +0800
Subject: [PATCH 02/15] add free-benchmark auto-fix
---
.../pytorch/free_benchmark/common/enums.py | 6 +-
.../pytorch/free_benchmark/common/utils.py | 7 +-
.../msprobe/pytorch/free_benchmark/main.py | 3 +
.../perturbed_layers/layer_factory.py | 5 +-
.../perturbed_layers/npu/auto_fix.py | 230 ++++++++++++++++++
.../perturbed_layers/npu/scale.py | 132 ++++++++++
6 files changed, 379 insertions(+), 4 deletions(-)
create mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
create mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
index 181631c624a..cb1654683f9 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
@@ -8,6 +8,8 @@ class PerturbationMode:
NO_CHANGE = "no_change"
BIT_NOISE = "bit_noise"
TO_CPU = "to_cpu"
+ AUTO = "auto_fix"
+ SCALE = "scale"
class DeviceType:
@@ -48,6 +50,8 @@ class PytorchFreeBenchmarkConst:
PerturbationMode.NO_CHANGE,
PerturbationMode.BIT_NOISE,
PerturbationMode.TO_CPU,
+ PerturbationMode.AUTO,
+ PerturbationMode.SCALE
]
DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION
DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU]
@@ -57,7 +61,7 @@ class PytorchFreeBenchmarkConst:
FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL]
DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL
FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD]
- FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU]
+ FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO, PerturbationMode.SCALE]
DEFAULT_FUZZ_STAGE = Const.FORWARD
DEFAULT_PREHEAT_STEP = 15
DEFAULT_MAX_SAMPLE = 20
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index 391c2ceaca0..4a46ac3a91f 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -16,7 +16,7 @@
import torch
from msprobe.core.common.exceptions import FreeBenchmarkException
from msprobe.pytorch.free_benchmark.common.enums import DeviceType
-
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
class Tools:
@@ -74,8 +74,11 @@ class Tools:
return tensor_seq
@staticmethod
- def convert_fuzz_output_to_origin(origin, perturbed):
+ def convert_fuzz_output_to_origin(origin, perturbed, pert_mode):
if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
+ if pert_mode == PerturbationMode.AUTO:
+ origin.data = perturbed.to(origin.device)
+ return origin #!guo
origin.data = perturbed.to(origin.dtype).to(origin.device)
return origin
if isinstance(origin, dict) and isinstance(perturbed, dict):
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
index 66d7b7e1042..1d3d6fdfe37 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
@@ -88,6 +88,9 @@ class FreeBenchmarkCheck(ABC):
layer.handle(data_params)
handler_params = make_handler_params(name, self.config, self.current_iter)
handler = FuzzHandlerFactory.create(handler_params)
+ if handler_params.pert_mode == PerturbationMode.AUTO:
+ perturbed_output = handler.handle(data_params, handler_params.pert_mode)
+ return perturbed_output, handler.get_unequal_rows()
perturbed_output = handler.handle(data_params)
return perturbed_output, handler.get_unequal_rows()
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
index 79256cd4063..dcbd6739fc8 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
@@ -25,7 +25,8 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.improve_precision impor
)
from msprobe.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer
from msprobe.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer
-
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.auto_fix import AutoLayer
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.scale import ScaleLayer
class LayerFactory:
layers = {
@@ -35,6 +36,8 @@ class LayerFactory:
PerturbationMode.NO_CHANGE: NoChangeLayer,
PerturbationMode.BIT_NOISE: BitNoiseLayer,
PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer,
+ PerturbationMode.AUTO: AutoLayer,
+ PerturbationMode.SCALE: ScaleLayer,
},
DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer},
}
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
new file mode 100644
index 00000000000..810bd33eb7f
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -0,0 +1,230 @@
+# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from msprobe.core.common.const import Const
+from msprobe.pytorch.free_benchmark import logger
+from msprobe.pytorch.free_benchmark.common.constant import CommonField
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
+from msprobe.pytorch.free_benchmark.common.params import DataParams
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
+ NpuBaseLayer,
+)
+
+class ScaleConst:
+ """
+ Class for ScaleLayer's const
+ """
+ SOFTMAX_NAME = "softmax"
+ LINEAR_NAME = "linear"
+ MATMUL_NAME = "matmul"
+
+ FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
+ FP16_UB = torch.finfo(torch.float16).max
+
+ import numpy as np
+ SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
+
+ COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
+
+class AutoLayer(NpuBaseLayer):
+ scale_var = 1.0
+
+ def check_catastrophe(self, tensor_obj):
+ if isinstance(tensor_obj, torch.Tensor):
+ if torch.all(tensor_obj.eq(0)):
+ return True
+ if torch.isinf(tensor_obj).any():
+ return True
+ if torch.isnan(tensor_obj).any():
+ return True
+ return False
+ if isinstance(tensor_obj, dict):
+ return any(self.check_catastrophe(value) for value in tensor_obj.values())
+ if isinstance(tensor_obj, (tuple, list)):
+ return any(self.check_catastrophe(value) for value in tensor_obj)
+ return False
+
+ def tensor_scale(self, tensor_obj,unscale=False):
+ if isinstance(tensor_obj, torch.Tensor):
+ if(unscale):
+ tensor_obj = self._unscale(tensor_obj)
+ else:
+ tensor_obj = self._scale(tensor_obj)
+ self.is_added = True
+ return tensor_obj
+ if isinstance(tensor_obj, dict):
+ return {
+ key: self.tensor_scale(value)
+ for key, value in tensor_obj.items()
+ }
+ if isinstance(tensor_obj, (tuple, list)):
+ return type(tensor_obj)(
+ [self.tensor_scale(value) for value in tensor_obj]
+ )
+ return tensor_obj
+
+ def tensor_contiguous(self, tensor_obj):
+ if isinstance(tensor_obj, torch.Tensor):
+ return tensor_obj.contiguous()
+ if isinstance(tensor_obj, dict):
+ return {
+ key: self.tensor_contiguous(value)
+ for key, value in tensor_obj.items()
+ }
+ if isinstance(tensor_obj, (tuple, list)):
+ return type(tensor_obj)(
+ [self.tensor_contiguous(value) for value in tensor_obj]
+ )
+ return tensor_obj
+
+ def improve_tensor_precision(self, tensor_obj):
+ if (
+ isinstance(tensor_obj, torch.Tensor)
+ and torch.is_floating_point(tensor_obj)
+ and tensor_obj.dtype not in [torch.float32, torch.float64]
+ ):
+ self._set_improve_values(tensor_obj)
+ tensor_obj = self._change_dtype(tensor_obj)
+ self.is_added = True
+ return tensor_obj
+ if isinstance(tensor_obj, dict):
+ return {
+ key: self.improve_tensor_precision(value)
+ for key, value in tensor_obj.items()
+ }
+ if isinstance(tensor_obj, (tuple, list)):
+ return type(tensor_obj)(
+ [self.improve_tensor_precision(value) for value in tensor_obj]
+ )
+ return tensor_obj
+
+ def handle(self, params: DataParams):
+ # Try Scale
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Perturbation is "
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ f"Trying Scale for this"
+ )
+ for x in ScaleConst.COMMUNICATION_NAMES:
+ if x in self.api_name:
+ params.perturbed_result=(params.original_result)/AutoLayer.scale_var
+ AutoLayer.scale_var = 1.0
+
+ if(self.check_catastrophe(params.args)):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
+ params.perturbed_result = params.original_result
+
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ new_args = params.args
+ else:
+ new_args = self.tensor_scale(params.args)
+
+ params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
+
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)
+ try:
+ new_args1 = params.perturbed_result,*new_args[1:]
+ params.perturbed_result = params.origin_func(*new_args1,
+ **params.kwargs)
+ except KeyError as e:
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+
+ # Try Improve Precision
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Auto Scaler is Useless "
+ f"Trying to improve precision for"
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ )
+ new_args = self.improve_tensor_precision(params.args)
+ if params.fuzz_stage == Const.BACKWARD:
+ new_kwargs = {}
+ else:
+ new_kwargs = self.improve_tensor_precision(params.kwargs)
+ # 如果输入中全为高精度、应跳过二次执行、减少多余显存引用
+ if not self.is_added:
+ return params.perturbed_result
+ if "inplace" in new_kwargs:
+ new_kwargs["inplace"] = False
+ params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+
+ # Try Synchronize
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Auto Scaler is Useless "
+ f"Trying Synchronize for"
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ )
+ torch_npu.npu.synchronize()
+ params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
+ torch_npu.npu.synchronize()
+
+ # Try Contiguous
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Synchronize is Useless, too "
+ f"Trying 'Contiguous' for"
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ )
+ new_args = self.tensor_contiguous(params.args)
+ new_kwargs = self.tensor_contiguous(params.kwargs)
+ if not self.is_added:
+ return params.perturbed_result
+ params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+
+ # Hint to 'tocpu'
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Contiguous is Useless, too "
+ f"Please set pert_mode to 'tocpu' for further check."
+ )
+ return params.original_result
+
+ def _scale(self, inputs):
+ self.scale_factor = (ScaleConst.SQRT_UB
+ / torch.maximum(ScaleConst.SQRT_UB,
+ torch.norm(inputs,p=1,dim=-1).max()))
+ scaled_inputs = inputs * self.scale_factor
+ AutoLayer.scale_var *= self.scale_factor
+ if AutoLayer.scale_var < ScaleConst.FP16_EPS:
+ AutoLayer.scale_var = ScaleConst.FP16_EPS
+ return scaled_inputs
+
+ def _unscale(self, output):
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach()
+ AutoLayer.scale_var = 1.0
+ return unscaled_outputs
+ def _set_improve_values(self, inputs):
+ if inputs.dtype in [torch.float16, torch.bfloat16]:
+ self.perturbed_value = torch.float32
+
+ def _change_dtype(self, inputs):
+ if hasattr(inputs, CommonField.DEVICE):
+ device = inputs.device
+ if device is CommonField.META:
+ new_inputs = inputs.to(
+ device=CommonField.META, dtype=self.perturbed_value
+ )
+ else:
+ new_inputs = inputs.to(dtype=self.perturbed_value).to(device)
+ else:
+ new_inputs = inputs.to(dtype=self.perturbed_value)
+ return new_inputs
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
new file mode 100644
index 00000000000..beaaa6c34bf
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
@@ -0,0 +1,132 @@
+# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!guo @Rodin
+import torch
+import torch_npu
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from msprobe.core.common.const import Const
+from msprobe.pytorch.free_benchmark import logger
+from msprobe.pytorch.free_benchmark.common.constant import CommonField
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
+from msprobe.pytorch.free_benchmark.common.params import DataParams
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
+ NpuBaseLayer,
+)
+
+class ScaleConst:
+ """
+ Class for ScaleLayer's const
+ """
+ SOFTMAX_NAME = "softmax"
+ LINEAR_NAME = "linear"
+ MATMUL_NAME = "matmul"
+
+ FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
+ FP16_UB = torch.finfo(torch.float16).max
+
+ import numpy as np
+ SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
+
+ COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
+
+
+class ScaleLayer(NpuBaseLayer):
+ var = 1.0
+
+ def is_nonvalid(self, tensor_obj):
+ if isinstance(tensor_obj, torch.Tensor):
+ if not torch.isfinite(tensor_obj).all():
+ return True
+ return False
+ if isinstance(tensor_obj, dict):
+ return any(self.is_nonvalid(value) for value in tensor_obj.values())
+ if isinstance(tensor_obj, (tuple, list)):
+ return any(self.is_nonvalid(value) for value in tensor_obj)
+ return False
+
+ def tensor_scale(self, tensor_obj,unscale=False):
+ if isinstance(tensor_obj, torch.Tensor):
+ if(unscale):
+ tensor_obj = self._unscale(tensor_obj)
+ else:
+ tensor_obj = self._scale(tensor_obj)
+ self.is_added = True
+ return tensor_obj
+ if isinstance(tensor_obj, dict):
+ return {
+ key: self.tensor_scale(value)
+ for key, value in tensor_obj.items()
+ }
+ if isinstance(tensor_obj, (tuple, list)):
+ return type(tensor_obj)(
+ [self.tensor_scale(value) for value in tensor_obj]
+ )
+ return tensor_obj
+
+ def handle(self, params: DataParams):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Perturbation is "
+ f"{PerturbationMode.SCALE} of {self.api_name}.")
+
+ for x in ScaleConst.COMMUNICATION_NAMES:
+ if x in self.api_name:
+ params.perturbed_result=(params.original_result)/ScaleLayer.var
+ ScaleLayer.var = 1.0
+ return params.perturbed_result
+
+ if(self.is_nonvalid(params.args)):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
+ params.perturbed_result = params.original_result
+ return params.perturbed_result
+
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ new_args = params.args
+ else:
+ new_args = self.tensor_scale(params.args)
+
+ params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
+
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)
+ try:
+ new_args1 = params.perturbed_result,*new_args[1:]
+ params.perturbed_result = params.origin_func(*new_args1,
+ **params.kwargs)
+ except KeyError as e:
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+
+ return params.perturbed_result
+
+
+ def _scale(self, inputs):
+ self.scale_factor = (ScaleConst.SQRT_UB
+ / torch.maximum(ScaleConst.SQRT_UB,
+ torch.norm(inputs,p=1,dim=-1).max()))
+ scaled_inputs = inputs * self.scale_factor
+ ScaleLayer.var *= self.scale_factor
+ if ScaleLayer.var < ScaleConst.FP16_EPS:
+ ScaleLayer.var = ScaleConst.FP16_EPS
+ return scaled_inputs
+
+ def _unscale(self, output):
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / ScaleLayer.var).detach()
+ ScaleLayer.var = 1.0
+ return unscaled_outputs
+
\ No newline at end of file
--
Gitee
From 06c0ba77c42a0bff3d4ae8e7175e8449d2875a82 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 18:01:43 +0800
Subject: [PATCH 03/15] add free-benchmark auto-fix; delete scale
---
.../docs/15.free_benchmarking_PyTorch.md | 7 +-
.../pytorch/free_benchmark/common/enums.py | 8 +-
.../msprobe/pytorch/free_benchmark/main.py | 1 +
.../perturbed_layers/layer_factory.py | 2 -
.../perturbed_layers/npu/scale.py | 132 ------------------
5 files changed, 8 insertions(+), 142 deletions(-)
delete mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
index a2bc2112c16..af3b9c9fef2 100644
--- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
@@ -20,7 +20,7 @@
2. **扰动因子**:基于torch.nn.Module的hook机制,在注册的hook函数中对算子输入进行特定类型扰动。
3. **误差分析**:
* **check**: 在hook函数中二次执行算子得到扰动后的算子输出,计算扰动后输出与原始输出的相对误差,查看是否符合精度标准;
- * **fix**: 需要做验证时,可以选择将特定扰动类型(升精度,to cpu)的输出替换原始输出,观察对模型Loss是否有影响。
+ * **fix**: 需要做验证时,可以选择将特定扰动类型(升精度,to cpu)的输出替换原始输出,观察对模型Loss是否有影响;需要恢复算子时,可以选择自动恢复,工具将自动执行——检测前向中NaN/inf/全0问题,然后基于缩放->切高精度->Synchronize->Contiguous->引导tocpu的顺序进行排查替换。
4. **精度风险算子**:不达标精度标准的,最终会在输出件中展示

@@ -96,12 +96,13 @@ D-->config.json配置
参数 | 是否必选 | 可配置项 | 适用场景 |
- pert_mode | 否 | "improve_precision" (默认) | (常用)(可做验证) 插桩算子可能在低精度下有精度问题,扰动因子会将输入的低精度向量升精度。 |
+ pert_mode | 否 | "improve_precision" (默认) | (常用)(可做验证) 插桩算子可能在低精度下有精度问题,扰动因子会将输入的低精度向量升精度。 |
"bit_noise" | (常用)插桩算子可能在轻微扰动下暴露精度问题,扰动因子会将输入向量最后一个比特位翻转。 |
"add_noise" | 插桩算子可能在轻微扰动下暴露精度问题,扰动因子会为输入向量增加一个极小。 |
"change_value" | 插桩算子可能存在大数吃小数问题,扰动因子会交换输入向量的首尾。 |
"no_change" | 插桩算子可能存在数值稳定性精度问题,扰动因子会复制原始输。 |
"to_cpu" | (可做验证) 插桩算子可能在同 CPU 精度表现不一致,扰动因子会将输入转至 CPU,需要配合 fuzz_device="cpu"使用。 |
+ "auto_fix" | (专做修复) 已有怀疑算子,实现自动恢复,检测前向中NaN/inf/全0问题,按照缩放->切高精度->Synchronize->Contiguous->引导tocpu的顺序进行排查替换,快速恢复。 |
fuzz_device | 否 | "npu" (默认) | pert_mode 不需要to cpu操作。 |
"cpu" | pert_mode 须配置为"to_cpu",目前仅支持"to cpu"扰动因子。 |
@@ -111,7 +112,7 @@ D-->config.json配置
参数 | 是否必选 | 可配置项 | 适用场景 |
handler_type | 否 | "check"(默认) | 要做精度问题算子排查,输出扰动前后不符合精度标准的算子,支持所有扰动因子。 |
- "fix" | 要做可疑算子验证,用扰动后输出替换原始输出,支持"improve_precision","to_cpu"两种扰动因子。 |
+ "fix" | 要做可疑算子验证,用扰动后输出替换原始输出,支持"improve_precision","to_cpu"两种扰动因子;要做快速修复,用扰动后输出替换原始输出,支持"auto_fix"。 |
### 3.3 在模型脚本中开启工具
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
index cb1654683f9..930aeff21a1 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
@@ -8,8 +8,7 @@ class PerturbationMode:
NO_CHANGE = "no_change"
BIT_NOISE = "bit_noise"
TO_CPU = "to_cpu"
- AUTO = "auto_fix"
- SCALE = "scale"
+ AUTO = "auto_fix"#!guo
class DeviceType:
@@ -50,8 +49,7 @@ class PytorchFreeBenchmarkConst:
PerturbationMode.NO_CHANGE,
PerturbationMode.BIT_NOISE,
PerturbationMode.TO_CPU,
- PerturbationMode.AUTO,
- PerturbationMode.SCALE
+ PerturbationMode.AUTO,#!guo
]
DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION
DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU]
@@ -61,7 +59,7 @@ class PytorchFreeBenchmarkConst:
FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL]
DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL
FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD]
- FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO, PerturbationMode.SCALE]
+ FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]#!guo
DEFAULT_FUZZ_STAGE = Const.FORWARD
DEFAULT_PREHEAT_STEP = 15
DEFAULT_MAX_SAMPLE = 20
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
index 1d3d6fdfe37..439a4cdf5c8 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
@@ -88,6 +88,7 @@ class FreeBenchmarkCheck(ABC):
layer.handle(data_params)
handler_params = make_handler_params(name, self.config, self.current_iter)
handler = FuzzHandlerFactory.create(handler_params)
+ #!guo
if handler_params.pert_mode == PerturbationMode.AUTO:
perturbed_output = handler.handle(data_params, handler_params.pert_mode)
return perturbed_output, handler.get_unequal_rows()
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
index dcbd6739fc8..15005ecd394 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
@@ -26,7 +26,6 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.improve_precision impor
from msprobe.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer
from msprobe.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer
from msprobe.pytorch.free_benchmark.perturbed_layers.npu.auto_fix import AutoLayer
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.scale import ScaleLayer
class LayerFactory:
layers = {
@@ -37,7 +36,6 @@ class LayerFactory:
PerturbationMode.BIT_NOISE: BitNoiseLayer,
PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer,
PerturbationMode.AUTO: AutoLayer,
- PerturbationMode.SCALE: ScaleLayer,
},
DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer},
}
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
deleted file mode 100644
index beaaa6c34bf..00000000000
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#!guo @Rodin
-import torch
-import torch_npu
-
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-from msprobe.core.common.const import Const
-from msprobe.pytorch.free_benchmark import logger
-from msprobe.pytorch.free_benchmark.common.constant import CommonField
-from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
-from msprobe.pytorch.free_benchmark.common.params import DataParams
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
- NpuBaseLayer,
-)
-
-class ScaleConst:
- """
- Class for ScaleLayer's const
- """
- SOFTMAX_NAME = "softmax"
- LINEAR_NAME = "linear"
- MATMUL_NAME = "matmul"
-
- FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
- FP16_UB = torch.finfo(torch.float16).max
-
- import numpy as np
- SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
-
- COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
-
-
-class ScaleLayer(NpuBaseLayer):
- var = 1.0
-
- def is_nonvalid(self, tensor_obj):
- if isinstance(tensor_obj, torch.Tensor):
- if not torch.isfinite(tensor_obj).all():
- return True
- return False
- if isinstance(tensor_obj, dict):
- return any(self.is_nonvalid(value) for value in tensor_obj.values())
- if isinstance(tensor_obj, (tuple, list)):
- return any(self.is_nonvalid(value) for value in tensor_obj)
- return False
-
- def tensor_scale(self, tensor_obj,unscale=False):
- if isinstance(tensor_obj, torch.Tensor):
- if(unscale):
- tensor_obj = self._unscale(tensor_obj)
- else:
- tensor_obj = self._scale(tensor_obj)
- self.is_added = True
- return tensor_obj
- if isinstance(tensor_obj, dict):
- return {
- key: self.tensor_scale(value)
- for key, value in tensor_obj.items()
- }
- if isinstance(tensor_obj, (tuple, list)):
- return type(tensor_obj)(
- [self.tensor_scale(value) for value in tensor_obj]
- )
- return tensor_obj
-
- def handle(self, params: DataParams):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Perturbation is "
- f"{PerturbationMode.SCALE} of {self.api_name}.")
-
- for x in ScaleConst.COMMUNICATION_NAMES:
- if x in self.api_name:
- params.perturbed_result=(params.original_result)/ScaleLayer.var
- ScaleLayer.var = 1.0
- return params.perturbed_result
-
- if(self.is_nonvalid(params.args)):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
- params.perturbed_result = params.original_result
- return params.perturbed_result
-
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- new_args = params.args
- else:
- new_args = self.tensor_scale(params.args)
-
- params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
-
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)
- try:
- new_args1 = params.perturbed_result,*new_args[1:]
- params.perturbed_result = params.origin_func(*new_args1,
- **params.kwargs)
- except KeyError as e:
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
-
- return params.perturbed_result
-
-
- def _scale(self, inputs):
- self.scale_factor = (ScaleConst.SQRT_UB
- / torch.maximum(ScaleConst.SQRT_UB,
- torch.norm(inputs,p=1,dim=-1).max()))
- scaled_inputs = inputs * self.scale_factor
- ScaleLayer.var *= self.scale_factor
- if ScaleLayer.var < ScaleConst.FP16_EPS:
- ScaleLayer.var = ScaleConst.FP16_EPS
- return scaled_inputs
-
- def _unscale(self, output):
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / ScaleLayer.var).detach()
- ScaleLayer.var = 1.0
- return unscaled_outputs
-
\ No newline at end of file
--
Gitee
From 66e6e9618b2a32665b5409144bfa5fb27fbafe6a Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 18:07:38 +0800
Subject: [PATCH 04/15] free-benchmark auto-fix: update docs; remove #!guo markers
---
.../msprobe/docs/15.free_benchmarking_PyTorch.md | 3 ++-
.../msprobe/pytorch/free_benchmark/common/enums.py | 6 +++---
.../msprobe/pytorch/free_benchmark/common/utils.py | 2 +-
debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py | 1 -
4 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
index af3b9c9fef2..6a590f4b093 100644
--- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
@@ -1,7 +1,7 @@
# PyTorch 场景的无标杆比对
## 1 简介
-* 本工具的目标是在不依赖标杆数据的情况下,检测模型训练中可能存在的精度问题API级别算子,并提供升精度和tocpu接口快速验证。
+* 本工具的目标是在不依赖标杆数据的情况下,检测模型训练中可能存在的精度问题API级别算子,并提供升精度和tocpu接口快速验证,以及针对算子的快速恢复。
* 工具基于**数值病态分析理论**:对算子的输入增加很小的扰动,从而放大输出值异常现象;检测算子原始输出和扰动后输出间误差是否符合精度标准。
* 该工具的**特点**有:
@@ -10,6 +10,7 @@
* 推荐使用场景(针对**算子精度问题**):
* **暂无标杆数据**,模型Loss异常,要做精度问题算子排查;
* **验证可疑算子**,要做进一步确认,验证是否对模型Loss有影响;
+ * **可疑算子快速恢复**,使用缩放(scale)、切高精度、同步等方法快速排查和恢复算子问题;
* 低精度模型效果不如高精度,要做精度问题算子排查。
* 该工具的约束
* 仅支持Pytorch2.x场景;
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
index 930aeff21a1..dac78f016a0 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
@@ -8,7 +8,7 @@ class PerturbationMode:
NO_CHANGE = "no_change"
BIT_NOISE = "bit_noise"
TO_CPU = "to_cpu"
- AUTO = "auto_fix"#!guo
+ AUTO = "auto_fix"
class DeviceType:
@@ -49,7 +49,7 @@ class PytorchFreeBenchmarkConst:
PerturbationMode.NO_CHANGE,
PerturbationMode.BIT_NOISE,
PerturbationMode.TO_CPU,
- PerturbationMode.AUTO,#!guo
+ PerturbationMode.AUTO,
]
DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION
DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU]
@@ -59,7 +59,7 @@ class PytorchFreeBenchmarkConst:
FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL]
DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL
FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD]
- FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]#!guo
+ FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]
DEFAULT_FUZZ_STAGE = Const.FORWARD
DEFAULT_PREHEAT_STEP = 15
DEFAULT_MAX_SAMPLE = 20
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index 4a46ac3a91f..51f3b143443 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -78,7 +78,7 @@ class Tools:
if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
if pert_mode == PerturbationMode.AUTO:
origin.data = perturbed.to(origin.device)
- return origin #!guo
+ return origin
origin.data = perturbed.to(origin.dtype).to(origin.device)
return origin
if isinstance(origin, dict) and isinstance(perturbed, dict):
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
index 439a4cdf5c8..1d3d6fdfe37 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
@@ -88,7 +88,6 @@ class FreeBenchmarkCheck(ABC):
layer.handle(data_params)
handler_params = make_handler_params(name, self.config, self.current_iter)
handler = FuzzHandlerFactory.create(handler_params)
- #!guo
if handler_params.pert_mode == PerturbationMode.AUTO:
perturbed_output = handler.handle(data_params, handler_params.pert_mode)
return perturbed_output, handler.get_unequal_rows()
--
Gitee
From f3cb1f0d5edae7c420132e7b8e6d16315c2760be Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 18:12:20 +0800
Subject: [PATCH 05/15] add free-benchmark auto-fix; delete scale
---
.../perturbed_layers/npu/auto_fix.py | 62 +++--
tmp.py | 258 ++++++++++++++++++
2 files changed, 303 insertions(+), 17 deletions(-)
create mode 100644 tmp.py
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 810bd33eb7f..1807323caa6 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -113,11 +113,16 @@ class AutoLayer(NpuBaseLayer):
return tensor_obj
def handle(self, params: DataParams):
- # Try Scale
+ if not self.check_catastrophe(params.original_result):
+ params.perturbed_result = params.original_result
+ return params.perturbed_result
+
+ #! Try Scale
logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: An Problem shows here.\n"
f"[msprobe] Free benchmark: Perturbation is "
f"{PerturbationMode.AUTO} of {self.api_name}."
- f"Trying Scale for this"
+ f"Trying Scale for this."
)
for x in ScaleConst.COMMUNICATION_NAMES:
if x in self.api_name:
@@ -145,12 +150,19 @@ class AutoLayer(NpuBaseLayer):
except KeyError as e:
logger.info_on_rank_0(
f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
- # Try Improve Precision
+ #! Try improve precision
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Auto Scaler is Useless "
- f"Trying to improve precision for"
+ f"[msprobe] Free benchmark: 'Scaler' is Useless. "
+ f"Trying to improve precision for "
f"{PerturbationMode.AUTO} of {self.api_name}."
)
new_args = self.improve_tensor_precision(params.args)
@@ -158,41 +170,57 @@ class AutoLayer(NpuBaseLayer):
new_kwargs = {}
else:
new_kwargs = self.improve_tensor_precision(params.kwargs)
- # 如果输入中全为高精度、应跳过二次执行、减少多余显存引用
- if not self.is_added:
- return params.perturbed_result
if "inplace" in new_kwargs:
new_kwargs["inplace"] = False
params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
- # Try Synchronize
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
+
+ #! Try Synchronize
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Auto Scaler is Useless "
- f"Trying Synchronize for"
+ f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
+ f"Trying Synchronize for "
f"{PerturbationMode.AUTO} of {self.api_name}."
)
torch_npu.npu.synchronize()
params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
torch_npu.npu.synchronize()
+
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
- # Try Contiguous
+ #! Try Contiguous
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Synchronize is Useless, too "
+ f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. "
f"Trying 'Contiguous' for"
f"{PerturbationMode.AUTO} of {self.api_name}."
)
new_args = self.tensor_contiguous(params.args)
new_kwargs = self.tensor_contiguous(params.kwargs)
- if not self.is_added:
- return params.perturbed_result
params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
- # Hint to 'tocpu'
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
+
+ #! Hint to 'tocpu'
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Contiguous is Useless, too "
+ f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
f"Please set pert_mode to 'tocpu' for further check."
)
return params.original_result
diff --git a/tmp.py b/tmp.py
new file mode 100644
index 00000000000..1102f8cfeb6
--- /dev/null
+++ b/tmp.py
@@ -0,0 +1,258 @@
+# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from msprobe.core.common.const import Const
+from msprobe.pytorch.free_benchmark import logger
+from msprobe.pytorch.free_benchmark.common.constant import CommonField
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
+from msprobe.pytorch.free_benchmark.common.params import DataParams
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
+ NpuBaseLayer,
+)
+
+class ScaleConst:
+    """
+    Constants used by the scale step of the auto-fix layer.
+
+    Groups the API-name substrings that are looked up in a layer's
+    ``api_name`` together with the float16 limits used to build scale factors.
+    """
+    # Substrings searched for inside an API name
+    SOFTMAX_NAME = "softmax"
+    LINEAR_NAME = "linear"
+    MATMUL_NAME = "matmul"
+
+    # finfo(...).tiny is the smallest positive normal float16 value
+    FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
+    # Largest finite float16 value
+    FP16_UB = torch.finfo(torch.float16).max
+
+    # NOTE(review): importing inside the class body also binds ``np`` as a
+    # class attribute (``ScaleConst.np``) - confirm this is intended.
+    import numpy as np
+    # One-element float16 tensor holding sqrt(FP16_UB); ``.npu()`` runs while
+    # this class body is executed, i.e. when the module is imported.
+    SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
+
+    # Substrings identifying collective-communication APIs
+    COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
+
+class AutoLayer(NpuBaseLayer):
+ scale_var = 1.0
+
+ def check_catastrophe(self, tensor_obj):
+ if isinstance(tensor_obj, torch.Tensor):
+ if torch.all(tensor_obj.eq(0)):
+ return True
+ if torch.isinf(tensor_obj).any():
+ return True
+ if torch.isnan(tensor_obj).any():
+ return True
+ return False
+ if isinstance(tensor_obj, dict):
+ return any(self.check_catastrophe(value) for value in tensor_obj.values())
+ if isinstance(tensor_obj, (tuple, list)):
+ return any(self.check_catastrophe(value) for value in tensor_obj)
+ return False
+
+ def tensor_scale(self, tensor_obj,unscale=False):
+ if isinstance(tensor_obj, torch.Tensor):
+ if(unscale):
+ tensor_obj = self._unscale(tensor_obj)
+ else:
+ tensor_obj = self._scale(tensor_obj)
+ self.is_added = True
+ return tensor_obj
+ if isinstance(tensor_obj, dict):
+ return {
+ key: self.tensor_scale(value)
+ for key, value in tensor_obj.items()
+ }
+ if isinstance(tensor_obj, (tuple, list)):
+ return type(tensor_obj)(
+ [self.tensor_scale(value) for value in tensor_obj]
+ )
+ return tensor_obj
+
+ def tensor_contiguous(self, tensor_obj):
+ if isinstance(tensor_obj, torch.Tensor):
+ return tensor_obj.contiguous()
+ if isinstance(tensor_obj, dict):
+ return {
+ key: self.tensor_contiguous(value)
+ for key, value in tensor_obj.items()
+ }
+ if isinstance(tensor_obj, (tuple, list)):
+ return type(tensor_obj)(
+ [self.tensor_contiguous(value) for value in tensor_obj]
+ )
+ return tensor_obj
+
+ def improve_tensor_precision(self, tensor_obj):
+ if (
+ isinstance(tensor_obj, torch.Tensor)
+ and torch.is_floating_point(tensor_obj)
+ and tensor_obj.dtype not in [torch.float32, torch.float64]
+ ):
+ self._set_improve_values(tensor_obj)
+ tensor_obj = self._change_dtype(tensor_obj)
+ self.is_added = True
+ return tensor_obj
+ if isinstance(tensor_obj, dict):
+ return {
+ key: self.improve_tensor_precision(value)
+ for key, value in tensor_obj.items()
+ }
+ if isinstance(tensor_obj, (tuple, list)):
+ return type(tensor_obj)(
+ [self.improve_tensor_precision(value) for value in tensor_obj]
+ )
+ return tensor_obj
+
+ def handle(self, params: DataParams):
+ if not self.check_catastrophe(params.original_result):
+ params.perturbed_result = params.original_result
+ return params.perturbed_result
+
+ #! Try Scale
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: An Problem shows here.\n"
+ f"[msprobe] Free benchmark: Perturbation is "
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ f"Trying Scale for this."
+ )
+ for x in ScaleConst.COMMUNICATION_NAMES:
+ if x in self.api_name:
+ params.perturbed_result=(params.original_result)/AutoLayer.scale_var
+ AutoLayer.scale_var = 1.0
+
+ if(self.check_catastrophe(params.args)):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
+ params.perturbed_result = params.original_result
+
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ new_args = params.args
+ else:
+ new_args = self.tensor_scale(params.args)
+
+ params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
+
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)
+ try:
+ new_args1 = params.perturbed_result,*new_args[1:]
+ params.perturbed_result = params.origin_func(*new_args1,
+ **params.kwargs)
+ except KeyError as e:
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
+
+ #! Try improve precision
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: 'Scaler' is Useless. "
+ f"Trying to improve precision for "
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ )
+ new_args = self.improve_tensor_precision(params.args)
+ if params.fuzz_stage == Const.BACKWARD:
+ new_kwargs = {}
+ else:
+ new_kwargs = self.improve_tensor_precision(params.kwargs)
+ if "inplace" in new_kwargs:
+ new_kwargs["inplace"] = False
+ params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
+
+ #! Try Synchronize
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
+ f"Trying Synchronize for "
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ )
+ torch_npu.npu.synchronize()
+ params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
+ torch_npu.npu.synchronize()
+
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
+
+ #! Try Contiguous
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. "
+ f"Trying 'Contiguous' for"
+ f"{PerturbationMode.AUTO} of {self.api_name}."
+ )
+ new_args = self.tensor_contiguous(params.args)
+ new_kwargs = self.tensor_contiguous(params.kwargs)
+ params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+
+ if not self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
+
+ #! Hint to 'tocpu'
+ if self.check_catastrophe(params.perturbed_result):
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
+ f"Please set pert_mode to 'tocpu' for further check."
+ )
+ return params.original_result
+
+ def _scale(self, inputs):
+ self.scale_factor = (ScaleConst.SQRT_UB
+ / torch.maximum(ScaleConst.SQRT_UB,
+ torch.norm(inputs,p=1,dim=-1).max()))
+ scaled_inputs = inputs * self.scale_factor
+ AutoLayer.scale_var *= self.scale_factor
+ if AutoLayer.scale_var < ScaleConst.FP16_EPS:
+ AutoLayer.scale_var = ScaleConst.FP16_EPS
+ return scaled_inputs
+
+ def _unscale(self, output):
+ if(ScaleConst.SOFTMAX_NAME in self.api_name):
+ unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach()
+ AutoLayer.scale_var = 1.0
+ return unscaled_outputs
+ def _set_improve_values(self, inputs):
+ if inputs.dtype in [torch.float16, torch.bfloat16]:
+ self.perturbed_value = torch.float32
+
+ def _change_dtype(self, inputs):
+ if hasattr(inputs, CommonField.DEVICE):
+ device = inputs.device
+ if device is CommonField.META:
+ new_inputs = inputs.to(
+ device=CommonField.META, dtype=self.perturbed_value
+ )
+ else:
+ new_inputs = inputs.to(dtype=self.perturbed_value).to(device)
+ else:
+ new_inputs = inputs.to(dtype=self.perturbed_value)
+ return new_inputs
\ No newline at end of file
--
Gitee
From 168c5abf2bbf6c7a280b7fcd40c3b66e81b521d5 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Mon, 25 Nov 2024 09:59:08 +0800
Subject: [PATCH 06/15] update doc02,15
---
debug/accuracy_tools/msprobe/docs/02.config_introduction.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index f9bcf3476a8..bd07f611059 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -106,8 +106,8 @@ PyTorch 与 MindSpore 动态图场景下,"level"须为"L1";MindSpore 静态
PyTorch 场景:指定某一类 API,对某一类的 API 进行无标杆比对。 配置示例:"list": ["relu"]。 |
MindSpore 场景:指定 API 名称,对列表中的 API 进行检测。 配置示例:"list": ["mindspore.mint.div", "mindspore.ops.bmm", "mindspore.Tensor.__add__"]。 |
fuzz_device | 标杆设备,str 类型。可选参数: "npu":无标杆,通过添加扰动因子进行比对,默认值; "cpu":以 CPU 为标杆,pert_mode 须配置为"to_cpu"(仅 PyTorch 场景支持)。 配置示例:"fuzz_device": "npu"。 | 否 |
- pert_mode | 无标杆扰动因子,str 类型。可选参数: "improve_precision":对输入做升精度,默认值; "add_noise":对输入增加噪声; "no_change":不加扰动直接二次执行; "bit_noise":输入的末位比特翻转,MindSpore 场景不支持 BF16 类型的向量; "change_value":输入的张量首尾值调换; "to_cpu":在 CPU 等价执行(仅 PyTorch 场景支持)。 配置示例:"pert_mode": "improve_precision"。 | 否 |
- handler_type | 处理类型,可选参数: "check":进行无标杆比对检查,默认值; "fix":将扰动后的 API 输出结果覆盖原始 API 输出结果,尝试将 Loss 曲线恢复正常,该模式下不支持预热功能与反向过程,且仅支持"improve_precision"、"to_cpu"( PyTorch 场景)两种扰动因子。 配置示例:"handler_type": "check"。 | 否 |
+ pert_mode | 无标杆扰动因子,str 类型。可选参数: "improve_precision":对输入做升精度,默认值; "add_noise":对输入增加噪声; "no_change":不加扰动直接二次执行; "bit_noise":输入的末位比特翻转,MindSpore 场景不支持 BF16 类型的向量; "change_value":输入的张量首尾值调换; "to_cpu":在 CPU 等价执行(仅 PyTorch 场景支持)。 "auto_fix":使用scale、切精度、同步等方法快速排除和恢复算子问题。 配置示例:"pert_mode": "improve_precision"。 | 否 |
+ handler_type | 处理类型,可选参数: "check":进行无标杆比对检查,默认值; "fix":将扰动后的 API 输出结果覆盖原始 API 输出结果,尝试将 Loss 曲线恢复正常,该模式下不支持预热功能与反向过程,且仅支持"improve_precision"、"to_cpu"( PyTorch 场景)、"auto_fix"( PyTorch 场景)三种扰动因子。 配置示例:"handler_type": "check"。 | 否 |
fuzz_level | 无标杆数据 dump 级别,即选择比对结果文件应输出的表头属性,当前仅支持取值为:"L1"。输出结果详见 1.6.1 无标杆比对数据存盘格式。 | 否 |
fuzz_stage | 比对过程,选择对 API 前向或反向进行无标杆比对,可选参数: "forward":前向,默认值; "backward":反向, 仅 PyTorch 场景支持。当 fuzz_stage 为 "backward" 时,handler_type 只能为 "check"。 配置示例:"fuzz_stage": "backward"。 | 否 |
if_preheat | 预热功能(仅 PyTorch 场景支持),bool 类型。开启功能后工具可以根据每次迭代的输出调整精度算法的阈值,从而更准确地找出存在精度问题的 API。当"handler_type": "fix"时,不支持预热。可选参数: true(开启)或 false(关闭),默认关闭。 配置示例:"if_preheat": "true"。 | 否 |
--
Gitee
From a7465de08b6120ebc80fa2094c2d96b527fe9579 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Mon, 25 Nov 2024 13:05:17 +0800
Subject: [PATCH 07/15] update auto_fix-tocpu
---
.../msprobe/pytorch/free_benchmark/common/utils.py | 2 +-
.../free_benchmark/perturbed_layers/npu/auto_fix.py | 10 +++++++++-
.../free_benchmark/result_handlers/fix_handler.py | 6 +++---
3 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index 51f3b143443..ac00bd13f6e 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -77,7 +77,7 @@ class Tools:
def convert_fuzz_output_to_origin(origin, perturbed, pert_mode):
if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
if pert_mode == PerturbationMode.AUTO:
- origin.data = perturbed.to(origin.device)
+ origin.data = perturbed
return origin
origin.data = perturbed.to(origin.dtype).to(origin.device)
return origin
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 1807323caa6..8f4ebda2141 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -23,6 +23,8 @@ from msprobe.pytorch.free_benchmark.common.params import DataParams
from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
NpuBaseLayer,
)
+from msprobe.pytorch.free_benchmark.common.utils import Tools
+from msprobe.pytorch.free_benchmark.common.enums import DeviceType
class ScaleConst:
"""
@@ -221,8 +223,14 @@ class AutoLayer(NpuBaseLayer):
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
- f"Please set pert_mode to 'tocpu' for further check."
+ f"Trying 'To_cpu' for"
+ f"{PerturbationMode.AUTO} of {self.api_name}."
)
+ new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True)
+ new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True)
+ params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+ return params.perturbed_result
+
return params.original_result
def _scale(self, inputs):
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py
index d0b918402dd..b70b0b6bce6 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py
@@ -20,17 +20,17 @@ from msprobe.pytorch.free_benchmark import logger
from msprobe.pytorch.free_benchmark.common.params import DataParams
from msprobe.pytorch.free_benchmark.common.utils import Tools
from msprobe.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler
-
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
class FixHandler(FuzzHandler):
def get_threshold(self, dtype):
return self._get_default_threshold(dtype)
- def handle(self, data_params: DataParams) -> Any:
+ def handle(self, data_params: DataParams, pert_mode: PerturbationMode = None) -> Any:
try:
return Tools.convert_fuzz_output_to_origin(
- data_params.original_result, data_params.perturbed_result
+ data_params.original_result, data_params.perturbed_result, pert_mode
)
except FreeBenchmarkException as e:
logger.warning(
--
Gitee
From f12cbeee8adebf5e559e85fc36b8ace61d431a27 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Mon, 25 Nov 2024 20:40:55 +0800
Subject: [PATCH 08/15] Update scale
---
.../perturbed_layers/npu/auto_fix.py | 117 +++++++++---------
1 file changed, 56 insertions(+), 61 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 8f4ebda2141..3a7b367627a 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -25,6 +25,7 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import
)
from msprobe.pytorch.free_benchmark.common.utils import Tools
from msprobe.pytorch.free_benchmark.common.enums import DeviceType
+from typing import Any, Callable, Dict, List, Optional, Tuple
class ScaleConst:
"""
@@ -43,8 +44,6 @@ class ScaleConst:
COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
class AutoLayer(NpuBaseLayer):
- scale_var = 1.0
-
def check_catastrophe(self, tensor_obj):
if isinstance(tensor_obj, torch.Tensor):
if torch.all(tensor_obj.eq(0)):
@@ -115,56 +114,50 @@ class AutoLayer(NpuBaseLayer):
return tensor_obj
def handle(self, params: DataParams):
- if not self.check_catastrophe(params.original_result):
- params.perturbed_result = params.original_result
+ self.scale_factor = 1.0
+ params.perturbed_result = params.original_result
+ if not self.check_catastrophe(params.perturbed_result):
return params.perturbed_result
-
- #! Try Scale
logger.info_on_rank_0(
- f"[msprobe] Free benchmark: An Problem shows here.\n"
- f"[msprobe] Free benchmark: Perturbation is "
- f"{PerturbationMode.AUTO} of {self.api_name}."
- f"Trying Scale for this."
- )
- for x in ScaleConst.COMMUNICATION_NAMES:
- if x in self.api_name:
- params.perturbed_result=(params.original_result)/AutoLayer.scale_var
- AutoLayer.scale_var = 1.0
-
- if(self.check_catastrophe(params.args)):
+ f"[msprobe] Free benchmark: An Problem shows here. "
+ )
+ #! Try Scale
+ if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME in self.api_name:
logger.info_on_rank_0(
- f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
- params.perturbed_result = params.original_result
-
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- new_args = params.args
- else:
+ f"[msprobe] Free benchmark: Perturbation is "
+ f"{PerturbationMode.AUTO} of {self.api_name}. "
+ f"Trying Scale for this."
+ )
new_args = self.tensor_scale(params.args)
-
- params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
-
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)
- try:
- new_args1 = params.perturbed_result,*new_args[1:]
- params.perturbed_result = params.origin_func(*new_args1,
- **params.kwargs)
- except KeyError as e:
+ params.perturbed_result = params.origin_func(
+ *new_args, **params.kwargs)
+
+ if (ScaleConst.SOFTMAX_NAME in self.api_name):
+ params.perturbed_result = self.tensor_scale(
+ params.perturbed_result, unscale=True)
+ try:
+ new_args1 = params.perturbed_result, *new_args[1:]
+ params.perturbed_result = params.origin_func(*new_args1,
+ **params.kwargs)
+ except KeyError as e:
+ logger.info_on_rank_0(
+ f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+ else:
+ params.perturbed_result = self.tensor_scale(
+ params.perturbed_result, unscale=True)
+
+ if not self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
-
- if not self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
- f"Problem solved."
- )
- return params.perturbed_result
+ f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
+ f"Problem solved."
+ )
+ return params.perturbed_result
#! Try improve precision
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
f"[msprobe] Free benchmark: 'Scaler' is Useless. "
- f"Trying to improve precision for "
+ f"Trying 'improve precision' for "
f"{PerturbationMode.AUTO} of {self.api_name}."
)
new_args = self.improve_tensor_precision(params.args)
@@ -187,7 +180,7 @@ class AutoLayer(NpuBaseLayer):
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
- f"Trying Synchronize for "
+ f"Trying 'Synchronize' for "
f"{PerturbationMode.AUTO} of {self.api_name}."
)
torch_npu.npu.synchronize()
@@ -223,31 +216,33 @@ class AutoLayer(NpuBaseLayer):
if self.check_catastrophe(params.perturbed_result):
logger.info_on_rank_0(
f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
- f"Trying 'To_cpu' for"
- f"{PerturbationMode.AUTO} of {self.api_name}."
+ f"Please set pert_mode to 'To_cpu' for further check."
)
- new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True)
- new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True)
- params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
- return params.perturbed_result
-
return params.original_result
+ def _get_scale_factor(self, inputs):
+ upper = ScaleConst.SQRT_UB / 2
+ lower = torch.maximum(ScaleConst.SQRT_UB,
+ torch.norm(inputs, p=1, dim=-1).max())
+ return upper / lower
+
def _scale(self, inputs):
- self.scale_factor = (ScaleConst.SQRT_UB
- / torch.maximum(ScaleConst.SQRT_UB,
- torch.norm(inputs,p=1,dim=-1).max()))
- scaled_inputs = inputs * self.scale_factor
- AutoLayer.scale_var *= self.scale_factor
- if AutoLayer.scale_var < ScaleConst.FP16_EPS:
- AutoLayer.scale_var = ScaleConst.FP16_EPS
+ cur_scale = self._get_scale_factor(inputs)
+ self.scale_factor = max(ScaleConst.FP16_EPS,
+ cur_scale * self.scale_factor)
+ scaled_inputs = inputs * cur_scale
return scaled_inputs
def _unscale(self, output):
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach()
- AutoLayer.scale_var = 1.0
- return unscaled_outputs
+ if (ScaleConst.SOFTMAX_NAME in self.api_name):
+ unscaled_outputs = (
+ torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
+ else:
+ rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
+ torch.norm(output, p=1, dim=-1).max())
+ unscaled_outputs = rescale_coeff * output
+ return unscaled_outputs
+
def _set_improve_values(self, inputs):
if inputs.dtype in [torch.float16, torch.bfloat16]:
self.perturbed_value = torch.float32
--
Gitee
From faf46ff3d18d3920e521719662204c7ddcca8082 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 11:53:44 +0800
Subject: [PATCH 09/15] bmm adding
---
.../pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 3a7b367627a..b5b4ff78e6e 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -34,6 +34,7 @@ class ScaleConst:
SOFTMAX_NAME = "softmax"
LINEAR_NAME = "linear"
MATMUL_NAME = "matmul"
+ BMM_NAME = "bmm"
FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
FP16_UB = torch.finfo(torch.float16).max
@@ -122,7 +123,7 @@ class AutoLayer(NpuBaseLayer):
f"[msprobe] Free benchmark: An Problem shows here. "
)
#! Try Scale
- if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME in self.api_name:
+ if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name:
logger.info_on_rank_0(
f"[msprobe] Free benchmark: Perturbation is "
f"{PerturbationMode.AUTO} of {self.api_name}. "
--
Gitee
From edfa2d8e0de72014042b53f8a12fb4fae9915bfe Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 12:11:22 +0800
Subject: [PATCH 10/15] bmm adding
---
.../perturbed_layers/npu/auto_fix.py | 27 ++++++++++---------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index b5b4ff78e6e..73ea09c2abc 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -26,7 +26,7 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import
from msprobe.pytorch.free_benchmark.common.utils import Tools
from msprobe.pytorch.free_benchmark.common.enums import DeviceType
from typing import Any, Callable, Dict, List, Optional, Tuple
-
+import numpy as np
class ScaleConst:
"""
Class for ScaleLayer's const
@@ -39,8 +39,8 @@ class ScaleConst:
FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
FP16_UB = torch.finfo(torch.float16).max
- import numpy as np
- SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
+
+ SQRT_UB = np.sqrt(FP16_UB)
COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
@@ -222,10 +222,12 @@ class AutoLayer(NpuBaseLayer):
return params.original_result
def _get_scale_factor(self, inputs):
- upper = ScaleConst.SQRT_UB / 2
- lower = torch.maximum(ScaleConst.SQRT_UB,
- torch.norm(inputs, p=1, dim=-1).max())
- return upper / lower
+ nominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
+ max_denominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
+ denominator = torch.maximum(max_denominator, torch.norm(inputs, p=1, dim=-1).max())
+ # if L1 norm of inputs is inf scale to 1 / sqrt(FP16_UB)
+ min_scale_factor = torch.tensor([1 / ScaleConst.SQRT_UB], dtype=torch.float16).npu()
+ return torch.maximum(nominator / denominator, min_scale_factor)
def _scale(self, inputs):
cur_scale = self._get_scale_factor(inputs)
@@ -236,13 +238,12 @@ class AutoLayer(NpuBaseLayer):
def _unscale(self, output):
if (ScaleConst.SOFTMAX_NAME in self.api_name):
- unscaled_outputs = (
- torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
+ unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
+
else:
- rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
- torch.norm(output, p=1, dim=-1).max())
- unscaled_outputs = rescale_coeff * output
- return unscaled_outputs
+ rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
+ unscaled_outputs = output / rescale_factor
+ unscaled_outputs = torch.nan_to_num(unscaled_outputs, posinf=np.sqrt(ScaleConst.FP16_UB))
def _set_improve_values(self, inputs):
if inputs.dtype in [torch.float16, torch.bfloat16]:
--
Gitee
From 17579b085e5d743e607df9067a6e32bc99c441ef Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 14:32:48 +0800
Subject: [PATCH 11/15] update 4 OP
---
.../perturbed_layers/npu/auto_fix.py | 26 +++++++++----------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 73ea09c2abc..a5c1cd6d163 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -40,7 +40,7 @@ class ScaleConst:
FP16_UB = torch.finfo(torch.float16).max
- SQRT_UB = np.sqrt(FP16_UB)
+ SQRT_UB = torch.tensor([np.sqrt(FP16_UB)], dtype=torch.float16).npu()
COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
@@ -122,6 +122,7 @@ class AutoLayer(NpuBaseLayer):
logger.info_on_rank_0(
f"[msprobe] Free benchmark: An Problem shows here. "
)
+
#! Try Scale
if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name:
logger.info_on_rank_0(
@@ -222,12 +223,10 @@ class AutoLayer(NpuBaseLayer):
return params.original_result
def _get_scale_factor(self, inputs):
- nominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
- max_denominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
- denominator = torch.maximum(max_denominator, torch.norm(inputs, p=1, dim=-1).max())
- # if L1 norm of inputs is inf scale to 1 / sqrt(FP16_UB)
- min_scale_factor = torch.tensor([1 / ScaleConst.SQRT_UB], dtype=torch.float16).npu()
- return torch.maximum(nominator / denominator, min_scale_factor)
+ upper = ScaleConst.SQRT_UB / 2
+ lower = torch.maximum(ScaleConst.SQRT_UB,
+ torch.norm(inputs, p=1, dim=-1).max())
+ return upper / lower
def _scale(self, inputs):
cur_scale = self._get_scale_factor(inputs)
@@ -235,15 +234,16 @@ class AutoLayer(NpuBaseLayer):
cur_scale * self.scale_factor)
scaled_inputs = inputs * cur_scale
return scaled_inputs
-
+
def _unscale(self, output):
if (ScaleConst.SOFTMAX_NAME in self.api_name):
- unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
-
+ unscaled_outputs = (
+ torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
else:
- rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
- unscaled_outputs = output / rescale_factor
- unscaled_outputs = torch.nan_to_num(unscaled_outputs, posinf=np.sqrt(ScaleConst.FP16_UB))
+ rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
+ torch.norm(output, p=1, dim=-1).max())
+ unscaled_outputs = rescale_coeff * output
+ return unscaled_outputs
def _set_improve_values(self, inputs):
if inputs.dtype in [torch.float16, torch.bfloat16]:
--
Gitee
From 61b6f056e98cd89ad6ff988300811047c2d0b336 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 14:38:52 +0800
Subject: [PATCH 12/15] update readme
---
debug/accuracy_tools/msprobe/docs/02.config_introduction.md | 2 +-
.../accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index bd07f611059..a5d9e27062f 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -103,7 +103,7 @@ PyTorch 与 MindSpore 动态图场景下,"level"须为"L1";MindSpore 静态
参数 | 解释 | 是否必选 |
scope | 自定义检测 API 列表(仅 PyTorch 场景支持),list[str] 类型,默认值为空列表,当 list 也为空列表时,表示检测所有 API。需要在 [ ] 内配置具体 API 名(在 dump 的结果中查看)。与 list 参数不能同时配置。 配置示例:"scope": ["Torch.matmul.0.forward", "Tensor.pow.4.forward"]。 | 否 |
list | 自定义检测 API 类型或 API 名称,list[str] 类型,默认值为空列表,表示检测所有 API(PyTorch 场景下还需 scope 也为空列表)。与 scope 参数不能同时配置。 | 否 |
- PyTorch 场景:指定某一类 API,对某一类的 API 进行无标杆比对。 配置示例:"list": ["relu"]。 |
+ PyTorch 场景:指定某一类 API,对某一类的 API 进行无标杆比对。 配置示例:"list": ["relu"]。注意:pert_mode 配置为 "auto_fix" 时,其中的 scale 修复方法支持 matmul、bmm、softmax、linear 四类算子,其他算子将跳过 scale,改用其他修复方法。 |
MindSpore 场景:指定 API 名称,对列表中的 API 进行检测。 配置示例:"list": ["mindspore.mint.div", "mindspore.ops.bmm", "mindspore.Tensor.__add__"]。 |
fuzz_device | 标杆设备,str 类型。可选参数: "npu":无标杆,通过添加扰动因子进行比对,默认值; "cpu":以 CPU 为标杆,pert_mode 须配置为"to_cpu"(仅 PyTorch 场景支持)。 配置示例:"fuzz_device": "npu"。 | 否 |
pert_mode | 无标杆扰动因子,str 类型。可选参数: "improve_precision":对输入做升精度,默认值; "add_noise":对输入增加噪声; "no_change":不加扰动直接二次执行; "bit_noise":输入的末位比特翻转,MindSpore 场景不支持 BF16 类型的向量; "change_value":输入的张量首尾值调换; "to_cpu":在 CPU 等价执行(仅 PyTorch 场景支持)。 "auto_fix":使用scale、切精度、同步等方法快速排除和恢复算子问题。 配置示例:"pert_mode": "improve_precision"。 | 否 |
diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
index 6a590f4b093..bd345813400 100644
--- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
@@ -88,7 +88,7 @@ D-->config.json配置
参数 | 是否必选 | 可配置项 | 适用场景 |
scope | 否 | 自定义 | 需要通过指定算子名来限制算子插桩范围 如:["Torch.matmul.0.forward", "Tensor.pow.4.forward"]。 |
- list | 否 | 自定义 | 需要通过指定算子类型来限制算子插桩范围 如:["relu"] 会匹配所有算子名中包含relu的算子。 |
+ list | 否 | 自定义 | 需要通过指定算子类型来限制算子插桩范围 如:["relu"] 会匹配所有算子名中包含relu的算子。注意:pert_mode 配置为 "auto_fix" 时,其中的 scale 修复方法支持 matmul、bmm、softmax、linear 四类算子,其他算子将跳过 scale,改用其他修复方法。 |
fuzz_stage | 否 | "forward"(默认) | 需要进行算子前向计算的精度问题排查或验证可疑算子。 |
"backward" | 需要进行算子反向计算的精度问题排查,不支持仅反向验证,前向验证包括反向。 | |
--
Gitee
From 2f8c3111a160ee6076c5c69e5fd1c9afa732bf1c Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Thu, 28 Nov 2024 10:44:25 +0800
Subject: [PATCH 13/15] backward scale fix support
---
.../pytorch/free_benchmark/common/utils.py | 2 +-
.../perturbed_layers/npu/auto_fix.py | 25 +++++++++++--------
2 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index ac00bd13f6e..3dbded07e94 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -77,7 +77,7 @@ class Tools:
def convert_fuzz_output_to_origin(origin, perturbed, pert_mode):
if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
if pert_mode == PerturbationMode.AUTO:
- origin.data = perturbed
+ origin = perturbed
return origin
origin.data = perturbed.to(origin.dtype).to(origin.device)
return origin
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index a5c1cd6d163..a210e2aaf69 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -41,6 +41,7 @@ class ScaleConst:
SQRT_UB = torch.tensor([np.sqrt(FP16_UB)], dtype=torch.float16).npu()
+ SQRT_UB_INV = torch.tensor([1 / np.sqrt(FP16_UB)], dtype=torch.float16).npu()
COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
@@ -221,12 +222,17 @@ class AutoLayer(NpuBaseLayer):
f"Please set pert_mode to 'To_cpu' for further check."
)
return params.original_result
-
+
def _get_scale_factor(self, inputs):
- upper = ScaleConst.SQRT_UB / 2
- lower = torch.maximum(ScaleConst.SQRT_UB,
- torch.norm(inputs, p=1, dim=-1).max())
- return upper / lower
+ nominator = ScaleConst.SQRT_UB
+ x_norm = torch.norm(inputs, p=1, dim=-1).max()
+ if(torch.isfinite(x_norm).all()):
+ denominator = torch.maximum(ScaleConst.SQRT_UB, x_norm)
+ else:
+ # If the L1 norm of inputs is not finite (inf or NaN), fall back to a scale of 1 / sqrt(FP16_UB).
+ return ScaleConst.SQRT_UB_INV.to(torch.get_device(inputs))
+ computed_scale = (nominator / denominator).to(torch.get_device(inputs))
+ return computed_scale
def _scale(self, inputs):
cur_scale = self._get_scale_factor(inputs)
@@ -237,12 +243,11 @@ class AutoLayer(NpuBaseLayer):
def _unscale(self, output):
if (ScaleConst.SOFTMAX_NAME in self.api_name):
- unscaled_outputs = (
- torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
+ unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
else:
- rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
- torch.norm(output, p=1, dim=-1).max())
- unscaled_outputs = rescale_coeff * output
+ rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
+ unscaled_outputs = output / rescale_factor
+
return unscaled_outputs
def _set_improve_values(self, inputs):
--
Gitee
From 870867b0459c06d46e4e0f997a2715aa92e6df26 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 24 Dec 2024 11:53:14 +0800
Subject: [PATCH 14/15] check-cla
---
.../perturbed_layers/npu/auto_fix.py | 28 ++++++++++++-------
1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index a210e2aaf69..4941af49719 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -15,18 +15,19 @@
import torch
import torch_npu
+import numpy as np
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import NpuBaseLayer
+from msprobe.pytorch.free_benchmark.common.utils import Tools
+from msprobe.pytorch.free_benchmark.common.enums import DeviceType
from msprobe.core.common.const import Const
from msprobe.pytorch.free_benchmark import logger
from msprobe.pytorch.free_benchmark.common.constant import CommonField
from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
from msprobe.pytorch.free_benchmark.common.params import DataParams
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
- NpuBaseLayer,
-)
-from msprobe.pytorch.free_benchmark.common.utils import Tools
-from msprobe.pytorch.free_benchmark.common.enums import DeviceType
-from typing import Any, Callable, Dict, List, Optional, Tuple
-import numpy as np
+
class ScaleConst:
"""
Class for ScaleLayer's const
@@ -116,6 +117,13 @@ class AutoLayer(NpuBaseLayer):
return tensor_obj
def handle(self, params: DataParams):
+ is_scale_applicable = (
+ ScaleConst.SOFTMAX_NAME in self.api_name or
+ ScaleConst.LINEAR_NAME in self.api_name or
+ ScaleConst.MATMUL_NAME in self.api_name or
+ ScaleConst.BMM_NAME in self.api_name
+ )
+
self.scale_factor = 1.0
params.perturbed_result = params.original_result
if not self.check_catastrophe(params.perturbed_result):
@@ -125,7 +133,7 @@ class AutoLayer(NpuBaseLayer):
)
#! Try Scale
- if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name:
+ if is_scale_applicable:
logger.info_on_rank_0(
f"[msprobe] Free benchmark: Perturbation is "
f"{PerturbationMode.AUTO} of {self.api_name}. "
@@ -244,11 +252,11 @@ class AutoLayer(NpuBaseLayer):
def _unscale(self, output):
if (ScaleConst.SOFTMAX_NAME in self.api_name):
unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
+ return unscaled_outputs
else:
rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
unscaled_outputs = output / rescale_factor
-
- return unscaled_outputs
+ return unscaled_outputs
def _set_improve_values(self, inputs):
if inputs.dtype in [torch.float16, torch.bfloat16]:
--
Gitee
From 67949d7af3ade77b1c0d4bfff508b9cd549cab3c Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 24 Dec 2024 14:28:07 +0800
Subject: [PATCH 15/15] 12/24
---
tmp.py | 258 ---------------------------------------------------------
1 file changed, 258 deletions(-)
delete mode 100644 tmp.py
diff --git a/tmp.py b/tmp.py
deleted file mode 100644
index 1102f8cfeb6..00000000000
--- a/tmp.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch_npu
-from msprobe.core.common.const import Const
-from msprobe.pytorch.free_benchmark import logger
-from msprobe.pytorch.free_benchmark.common.constant import CommonField
-from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
-from msprobe.pytorch.free_benchmark.common.params import DataParams
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
- NpuBaseLayer,
-)
-
-class ScaleConst:
- """
- Class for ScaleLayer's const
- """
- SOFTMAX_NAME = "softmax"
- LINEAR_NAME = "linear"
- MATMUL_NAME = "matmul"
-
- FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
- FP16_UB = torch.finfo(torch.float16).max
-
- import numpy as np
- SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
-
- COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
-
-class AutoLayer(NpuBaseLayer):
- scale_var = 1.0
-
- def check_catastrophe(self, tensor_obj):
- if isinstance(tensor_obj, torch.Tensor):
- if torch.all(tensor_obj.eq(0)):
- return True
- if torch.isinf(tensor_obj).any():
- return True
- if torch.isnan(tensor_obj).any():
- return True
- return False
- if isinstance(tensor_obj, dict):
- return any(self.check_catastrophe(value) for value in tensor_obj.values())
- if isinstance(tensor_obj, (tuple, list)):
- return any(self.check_catastrophe(value) for value in tensor_obj)
- return False
-
- def tensor_scale(self, tensor_obj,unscale=False):
- if isinstance(tensor_obj, torch.Tensor):
- if(unscale):
- tensor_obj = self._unscale(tensor_obj)
- else:
- tensor_obj = self._scale(tensor_obj)
- self.is_added = True
- return tensor_obj
- if isinstance(tensor_obj, dict):
- return {
- key: self.tensor_scale(value)
- for key, value in tensor_obj.items()
- }
- if isinstance(tensor_obj, (tuple, list)):
- return type(tensor_obj)(
- [self.tensor_scale(value) for value in tensor_obj]
- )
- return tensor_obj
-
- def tensor_contiguous(self, tensor_obj):
- if isinstance(tensor_obj, torch.Tensor):
- return tensor_obj.contiguous()
- if isinstance(tensor_obj, dict):
- return {
- key: self.tensor_contiguous(value)
- for key, value in tensor_obj.items()
- }
- if isinstance(tensor_obj, (tuple, list)):
- return type(tensor_obj)(
- [self.tensor_contiguous(value) for value in tensor_obj]
- )
- return tensor_obj
-
- def improve_tensor_precision(self, tensor_obj):
- if (
- isinstance(tensor_obj, torch.Tensor)
- and torch.is_floating_point(tensor_obj)
- and tensor_obj.dtype not in [torch.float32, torch.float64]
- ):
- self._set_improve_values(tensor_obj)
- tensor_obj = self._change_dtype(tensor_obj)
- self.is_added = True
- return tensor_obj
- if isinstance(tensor_obj, dict):
- return {
- key: self.improve_tensor_precision(value)
- for key, value in tensor_obj.items()
- }
- if isinstance(tensor_obj, (tuple, list)):
- return type(tensor_obj)(
- [self.improve_tensor_precision(value) for value in tensor_obj]
- )
- return tensor_obj
-
- def handle(self, params: DataParams):
- if not self.check_catastrophe(params.original_result):
- params.perturbed_result = params.original_result
- return params.perturbed_result
-
- #! Try Scale
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: An Problem shows here.\n"
- f"[msprobe] Free benchmark: Perturbation is "
- f"{PerturbationMode.AUTO} of {self.api_name}."
- f"Trying Scale for this."
- )
- for x in ScaleConst.COMMUNICATION_NAMES:
- if x in self.api_name:
- params.perturbed_result=(params.original_result)/AutoLayer.scale_var
- AutoLayer.scale_var = 1.0
-
- if(self.check_catastrophe(params.args)):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
- params.perturbed_result = params.original_result
-
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- new_args = params.args
- else:
- new_args = self.tensor_scale(params.args)
-
- params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
-
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)
- try:
- new_args1 = params.perturbed_result,*new_args[1:]
- params.perturbed_result = params.origin_func(*new_args1,
- **params.kwargs)
- except KeyError as e:
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
-
- if not self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
- f"Problem solved."
- )
- return params.perturbed_result
-
- #! Try improve precision
- if self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: 'Scaler' is Useless. "
- f"Trying to improve precision for "
- f"{PerturbationMode.AUTO} of {self.api_name}."
- )
- new_args = self.improve_tensor_precision(params.args)
- if params.fuzz_stage == Const.BACKWARD:
- new_kwargs = {}
- else:
- new_kwargs = self.improve_tensor_precision(params.kwargs)
- if "inplace" in new_kwargs:
- new_kwargs["inplace"] = False
- params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
-
- if not self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, "
- f"Problem solved."
- )
- return params.perturbed_result
-
- #! Try Synchronize
- if self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
- f"Trying Synchronize for "
- f"{PerturbationMode.AUTO} of {self.api_name}."
- )
- torch_npu.npu.synchronize()
- params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
- torch_npu.npu.synchronize()
-
- if not self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, "
- f"Problem solved."
- )
- return params.perturbed_result
-
- #! Try Contiguous
- if self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. "
- f"Trying 'Contiguous' for"
- f"{PerturbationMode.AUTO} of {self.api_name}."
- )
- new_args = self.tensor_contiguous(params.args)
- new_kwargs = self.tensor_contiguous(params.kwargs)
- params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
-
- if not self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, "
- f"Problem solved."
- )
- return params.perturbed_result
-
- #! Hint to 'tocpu'
- if self.check_catastrophe(params.perturbed_result):
- logger.info_on_rank_0(
- f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
- f"Please set pert_mode to 'tocpu' for further check."
- )
- return params.original_result
-
- def _scale(self, inputs):
- self.scale_factor = (ScaleConst.SQRT_UB
- / torch.maximum(ScaleConst.SQRT_UB,
- torch.norm(inputs,p=1,dim=-1).max()))
- scaled_inputs = inputs * self.scale_factor
- AutoLayer.scale_var *= self.scale_factor
- if AutoLayer.scale_var < ScaleConst.FP16_EPS:
- AutoLayer.scale_var = ScaleConst.FP16_EPS
- return scaled_inputs
-
- def _unscale(self, output):
- if(ScaleConst.SOFTMAX_NAME in self.api_name):
- unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach()
- AutoLayer.scale_var = 1.0
- return unscaled_outputs
- def _set_improve_values(self, inputs):
- if inputs.dtype in [torch.float16, torch.bfloat16]:
- self.perturbed_value = torch.float32
-
- def _change_dtype(self, inputs):
- if hasattr(inputs, CommonField.DEVICE):
- device = inputs.device
- if device is CommonField.META:
- new_inputs = inputs.to(
- device=CommonField.META, dtype=self.perturbed_value
- )
- else:
- new_inputs = inputs.to(dtype=self.perturbed_value).to(device)
- else:
- new_inputs = inputs.to(dtype=self.perturbed_value)
- return new_inputs
\ No newline at end of file
--
Gitee