From b79e5bb98c488006a3075ee4c4f079e99ddc2333 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 11:33:20 +0800
Subject: [PATCH 01/15] test

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ea548a0bfca..623c205332c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# 🚨 重要通知
+# 🚨 重要通知:
 
 **1. Ascend Training Tools 更名为 MindStudio Training Tools (mstt)。**
 
-- 
Gitee


From e76ef748d55fd6d00339e2db25d67ab008fb249b Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 12:12:13 +0800
Subject: [PATCH 02/15] add free-benchmark auto-fix

---
 .../pytorch/free_benchmark/common/enums.py    |   6 +-
 .../pytorch/free_benchmark/common/utils.py    |   7 +-
 .../msprobe/pytorch/free_benchmark/main.py    |   3 +
 .../perturbed_layers/layer_factory.py         |   5 +-
 .../perturbed_layers/npu/auto_fix.py          | 230 ++++++++++++++++++
 .../perturbed_layers/npu/scale.py             | 132 ++++++++++
 6 files changed, 379 insertions(+), 4 deletions(-)
 create mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
 create mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
index 181631c624a..cb1654683f9 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
@@ -8,6 +8,8 @@ class PerturbationMode:
     NO_CHANGE = "no_change"
     BIT_NOISE = "bit_noise"
     TO_CPU = "to_cpu"
+    AUTO = "auto_fix"
+    SCALE = "scale"
 
 
 class DeviceType:
@@ -48,6 +50,8 @@ class PytorchFreeBenchmarkConst:
         PerturbationMode.NO_CHANGE,
         PerturbationMode.BIT_NOISE,
         PerturbationMode.TO_CPU,
+        PerturbationMode.AUTO,
+        PerturbationMode.SCALE
     ]
     DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION
     DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU]
@@ -57,7 +61,7 @@ class PytorchFreeBenchmarkConst:
     FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL]
     DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL
     FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD]
-    FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU]
+    FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO, PerturbationMode.SCALE]
     DEFAULT_FUZZ_STAGE = Const.FORWARD
     DEFAULT_PREHEAT_STEP = 15
     DEFAULT_MAX_SAMPLE = 20
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index 391c2ceaca0..4a46ac3a91f 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -16,7 +16,7 @@
 import torch
 from msprobe.core.common.exceptions import FreeBenchmarkException
 from msprobe.pytorch.free_benchmark.common.enums import DeviceType
-
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
 
 class Tools:
 
@@ -74,8 +74,11 @@ class Tools:
         return tensor_seq
 
     @staticmethod
-    def convert_fuzz_output_to_origin(origin, perturbed):
+    def convert_fuzz_output_to_origin(origin, perturbed, pert_mode):
         if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
+            if pert_mode == PerturbationMode.AUTO:
+                origin.data = perturbed.to(origin.device)
+                return origin #!guo
             origin.data = perturbed.to(origin.dtype).to(origin.device)
             return origin
         if isinstance(origin, dict) and isinstance(perturbed, dict):
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
index 66d7b7e1042..1d3d6fdfe37 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
@@ -88,6 +88,9 @@ class FreeBenchmarkCheck(ABC):
         layer.handle(data_params)
         handler_params = make_handler_params(name, self.config, self.current_iter)
         handler = FuzzHandlerFactory.create(handler_params)
+        if handler_params.pert_mode == PerturbationMode.AUTO:
+            perturbed_output = handler.handle(data_params, handler_params.pert_mode)
+            return perturbed_output, handler.get_unequal_rows()
         perturbed_output = handler.handle(data_params)
         return perturbed_output, handler.get_unequal_rows()
 
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
index 79256cd4063..dcbd6739fc8 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
@@ -25,7 +25,8 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.improve_precision impor
 )
 from msprobe.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer
 from msprobe.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer
-
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.auto_fix import AutoLayer
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.scale import ScaleLayer
 
 class LayerFactory:
     layers = {
@@ -35,6 +36,8 @@ class LayerFactory:
             PerturbationMode.NO_CHANGE: NoChangeLayer,
             PerturbationMode.BIT_NOISE: BitNoiseLayer,
             PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer,
+            PerturbationMode.AUTO: AutoLayer,
+            PerturbationMode.SCALE: ScaleLayer,
         },
         DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer},
     }
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
new file mode 100644
index 00000000000..810bd33eb7f
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -0,0 +1,230 @@
+# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from msprobe.core.common.const import Const
+from msprobe.pytorch.free_benchmark import logger
+from msprobe.pytorch.free_benchmark.common.constant import CommonField
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
+from msprobe.pytorch.free_benchmark.common.params import DataParams
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
+    NpuBaseLayer,
+)
+
+class ScaleConst:
+    """Constants used by the scaling step of the auto-fix layer.
+
+    NOTE(review): SQRT_UB is moved to the NPU while this class body runs,
+    i.e. at import time - confirm a device is always available by then.
+    """
+    SOFTMAX_NAME = "softmax"
+    LINEAR_NAME = "linear"
+    MATMUL_NAME = "matmul"
+    # Smallest positive normal float16 and largest finite float16.
+    FP16_EPS = torch.finfo(torch.float16).tiny  # TODO dtype fix
+    FP16_UB = torch.finfo(torch.float16).max
+    # Plain ** 0.5 instead of np.sqrt: no import statement in the class body.
+    SQRT_UB = torch.tensor([FP16_UB ** 0.5], dtype=torch.float16).npu()
+    COMMUNICATION_NAMES = ["all_reduce", "all_gather", "reduce_scatter"]
+    
+class AutoLayer(NpuBaseLayer):
+    scale_var = 1.0  # class-wide product of the factors applied by _scale; reset to 1.0 once it is undone
+    
+    def check_catastrophe(self, tensor_obj):
+        """Return True if any tensor in tensor_obj is all-zero or has inf/nan.
+
+        Dicts, tuples and lists are searched recursively. Empty tensors are
+        skipped: torch.all would vacuously report them as all-zero.
+        """
+        if isinstance(tensor_obj, torch.Tensor):
+            non_finite = not torch.isfinite(tensor_obj).all()
+            return tensor_obj.numel() > 0 and bool(non_finite or (tensor_obj == 0).all())
+        if isinstance(tensor_obj, dict):
+            tensor_obj = list(tensor_obj.values())
+        if isinstance(tensor_obj, (tuple, list)):
+            return any(self.check_catastrophe(value) for value in tensor_obj)
+        return False
+
+    def tensor_scale(self, tensor_obj, unscale=False):
+        """Scale (or, with unscale=True, unscale) every tensor in tensor_obj.
+
+        Dicts, tuples and lists are rebuilt with the same container type and
+        the unscale flag is forwarded to nested values; non-tensor leaves are
+        returned unchanged. Scaling a tensor also sets self.is_added.
+        """
+        if isinstance(tensor_obj, torch.Tensor):
+            if unscale:
+                return self._unscale(tensor_obj)
+            scaled = self._scale(tensor_obj)
+            self.is_added = True
+            return scaled
+        if isinstance(tensor_obj, dict):
+            return {key: self.tensor_scale(value, unscale) for key, value in tensor_obj.items()}
+        if isinstance(tensor_obj, (tuple, list)):
+            return type(tensor_obj)([self.tensor_scale(value, unscale) for value in tensor_obj])
+        return tensor_obj
+    
+    def tensor_contiguous(self, tensor_obj):
+        """Return tensor_obj with every contained tensor made contiguous.
+
+        Dicts, tuples and lists are rebuilt recursively with the same
+        container type; any other object is returned unchanged.
+        """
+        if isinstance(tensor_obj, torch.Tensor):
+            return tensor_obj.contiguous()
+        if isinstance(tensor_obj, (tuple, list)):
+            return type(tensor_obj)([self.tensor_contiguous(item) for item in tensor_obj])
+        if isinstance(tensor_obj, dict):
+            return {name: self.tensor_contiguous(item) for name, item in tensor_obj.items()}
+        return tensor_obj
+    
+    def improve_tensor_precision(self, tensor_obj):
+        """Cast low-precision floating tensors in tensor_obj to higher precision.
+
+        Floating tensors that are not float32/float64 go through
+        _set_improve_values and _change_dtype and set self.is_added. Dicts,
+        tuples and lists are rebuilt recursively; the rest is returned as is.
+        """
+        if isinstance(tensor_obj, torch.Tensor):
+            high_precision = (torch.float32, torch.float64)
+            if torch.is_floating_point(tensor_obj) and tensor_obj.dtype not in high_precision:
+                self._set_improve_values(tensor_obj)
+                converted = self._change_dtype(tensor_obj)
+                self.is_added = True
+                return converted
+            return tensor_obj
+        if isinstance(tensor_obj, dict):
+            return {name: self.improve_tensor_precision(item) for name, item in tensor_obj.items()}
+        if isinstance(tensor_obj, (tuple, list)):
+            return type(tensor_obj)([self.improve_tensor_precision(item) for item in tensor_obj])
+        return tensor_obj
+    
+    def handle(self, params: DataParams):
+        """Run the auto-fix chain; the outcome is stored in params.perturbed_result.
+
+        Communication APIs only undo the accumulated scale. Other APIs are re-run
+        with scaled inputs (softmax is recomputed from its unscaled output) and,
+        while the result is all-zero or holds inf/nan, retried with higher
+        precision, device synchronisation and contiguous inputs, in that order.
+        Returns params.perturbed_result.
+        """
+        logger.info_on_rank_0(
+            f"[msprobe] Free benchmark: Perturbation is "
+            f"{PerturbationMode.AUTO} of {self.api_name}. "
+            f"Trying Scale for this"
+        )
+        if any(name in self.api_name for name in ScaleConst.COMMUNICATION_NAMES):
+            # Only undo the accumulated scale; nothing is re-executed for these APIs.
+            params.perturbed_result = params.original_result / AutoLayer.scale_var
+            AutoLayer.scale_var = 1.0
+            return params.perturbed_result
+        if self.check_catastrophe(params.args):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
+            # Broken inputs cannot be repaired here: keep the original result.
+            params.perturbed_result = params.original_result
+            return params.perturbed_result
+        is_softmax = ScaleConst.SOFTMAX_NAME in self.api_name
+        new_args = params.args if is_softmax else self.tensor_scale(params.args)
+        params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
+        if is_softmax:
+            # Take the log of the output, divide by the scale, then recompute.
+            params.perturbed_result = self.tensor_scale(params.perturbed_result, unscale=True)
+            try:
+                new_args1 = params.perturbed_result, *new_args[1:]
+                params.perturbed_result = params.origin_func(*new_args1, **params.kwargs)
+            except KeyError:
+                logger.info_on_rank_0(
+                    f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+
+        # Try Improve Precision
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Auto Scaler is Useless. "
+                f"Trying to improve precision for {PerturbationMode.AUTO} of {self.api_name}."
+            )
+            # tensor_scale may have set is_added; reset so it reflects upcasting only.
+            self.is_added = False
+            new_args = self.improve_tensor_precision(params.args)
+            if params.fuzz_stage == Const.BACKWARD:
+                new_kwargs = {}
+            else:
+                new_kwargs = self.improve_tensor_precision(params.kwargs)
+            # Skip the re-run when every input is already high precision (saves memory).
+            if self.is_added:
+                if "inplace" in new_kwargs:
+                    new_kwargs["inplace"] = False
+                params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+
+        # Try Synchronize
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Auto Scaler is Useless. "
+                f"Trying Synchronize for {PerturbationMode.AUTO} of {self.api_name}."
+            )
+            torch_npu.npu.synchronize()
+            params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
+            torch_npu.npu.synchronize()
+
+        # Try Contiguous
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Synchronize is Useless, too. "
+                f"Trying 'Contiguous' for {PerturbationMode.AUTO} of {self.api_name}."
+            )
+            new_args = self.tensor_contiguous(params.args)
+            new_kwargs = self.tensor_contiguous(params.kwargs)
+            params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+
+        # Hint to 'to_cpu'
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Contiguous is Useless, too. "
+                f"Please set pert_mode to 'to_cpu' for further check."
+            )
+        return params.perturbed_result
+
+    def _scale(self, inputs):
+        """Scale a floating tensor so its largest last-dim L1 norm is <= SQRT_UB."""
+        if not torch.is_floating_point(inputs) or inputs.numel() == 0:
+            return inputs  # e.g. index tensors: norm() rejects them, scaling corrupts them
+        bound = ScaleConst.SQRT_UB.float()  # fp32: an fp16 norm above FP16_UB becomes inf
+        peak = torch.norm(inputs.float(), p=1, dim=-1).max()
+        self.scale_factor = (bound / torch.maximum(bound, peak)).to(ScaleConst.SQRT_UB.dtype)
+        AutoLayer.scale_var = torch.clamp(AutoLayer.scale_var * self.scale_factor, min=ScaleConst.FP16_EPS)
+        return (inputs * self.scale_factor).to(inputs.dtype)  # keep the caller's dtype
+    
+    def _unscale(self, output):  # softmax output -> log(output) / scale; other APIs pass through
+        if ScaleConst.SOFTMAX_NAME not in self.api_name:
+            return output  # was an implicit None, which would drop the tensor
+        scale, AutoLayer.scale_var = AutoLayer.scale_var, 1.0  # consume the accumulated scale
+        return (torch.log(output + ScaleConst.FP16_EPS) / scale).detach()
+    def _set_improve_values(self, inputs):  # choose the dtype later applied by _change_dtype
+        if inputs.dtype in [torch.float16, torch.bfloat16]:
+            self.perturbed_value = torch.float32  # any other dtype leaves perturbed_value untouched
+
+    def _change_dtype(self, inputs):
+        """Return inputs cast to self.perturbed_value, on the same device.
+
+        A single dtype-only .to() replaces the former device branches:
+        "device is CommonField.META" compared object identity, so it
+        presumably never selected the meta branch, and moving the cast
+        result back to inputs.device did not change where the tensor lives.
+
+        NOTE(review): a perturbed_value of None leaves the dtype unchanged;
+        confirm that is acceptable for dtypes _set_improve_values ignores.
+        """
+        return inputs.to(dtype=self.perturbed_value)
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
new file mode 100644
index 00000000000..beaaa6c34bf
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
@@ -0,0 +1,132 @@
+# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!guo @Rodin
+import torch
+import torch_npu
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from msprobe.core.common.const import Const
+from msprobe.pytorch.free_benchmark import logger
+from msprobe.pytorch.free_benchmark.common.constant import CommonField
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
+from msprobe.pytorch.free_benchmark.common.params import DataParams
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
+    NpuBaseLayer,
+)
+
+class ScaleConst:
+    """
+    Class for ScaleLayer's const
+    """
+    SOFTMAX_NAME = "softmax"
+    LINEAR_NAME = "linear"
+    MATMUL_NAME = "matmul"
+
+    FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
+    FP16_UB = torch.finfo(torch.float16).max
+    
+    import numpy as np
+    SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
+    
+    COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
+
+
+class ScaleLayer(NpuBaseLayer):
+    var = 1.0
+    
+    def is_nonvalid(self, tensor_obj):
+        if isinstance(tensor_obj, torch.Tensor):
+            if not torch.isfinite(tensor_obj).all():
+                return True
+            return False
+        if isinstance(tensor_obj, dict):
+            return any(self.is_nonvalid(value) for value in tensor_obj.values())
+        if isinstance(tensor_obj, (tuple, list)):
+            return any(self.is_nonvalid(value) for value in tensor_obj)
+        return False
+    
+    def tensor_scale(self, tensor_obj,unscale=False):
+        if isinstance(tensor_obj, torch.Tensor):
+            if(unscale):
+                tensor_obj = self._unscale(tensor_obj)
+            else:
+                tensor_obj = self._scale(tensor_obj)
+                self.is_added = True
+            return tensor_obj
+        if isinstance(tensor_obj, dict):
+            return {
+                key: self.tensor_scale(value) 
+                for key, value in tensor_obj.items()
+            }
+        if isinstance(tensor_obj, (tuple, list)):
+            return type(tensor_obj)(
+                [self.tensor_scale(value) for value in tensor_obj]
+            )
+        return tensor_obj
+
+    def handle(self, params: DataParams):
+        logger.info_on_rank_0(
+            f"[msprobe] Free benchmark: Perturbation is "
+            f"{PerturbationMode.SCALE} of {self.api_name}.")
+
+        for x in ScaleConst.COMMUNICATION_NAMES:
+            if x in self.api_name:
+                params.perturbed_result=(params.original_result)/ScaleLayer.var
+                ScaleLayer.var = 1.0
+                return params.perturbed_result
+
+        if(self.is_nonvalid(params.args)):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
+            params.perturbed_result = params.original_result
+            return params.perturbed_result
+            
+        if(ScaleConst.SOFTMAX_NAME in self.api_name):
+            new_args = params.args
+        else:
+            new_args = self.tensor_scale(params.args)
+
+        params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
+
+        if(ScaleConst.SOFTMAX_NAME in self.api_name):
+            params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)  
+            try:
+                new_args1 = params.perturbed_result,*new_args[1:]
+                params.perturbed_result = params.origin_func(*new_args1,
+                                                            **params.kwargs)
+            except KeyError as e:
+                logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+
+        return params.perturbed_result
+    
+            
+    def _scale(self, inputs):
+        self.scale_factor = (ScaleConst.SQRT_UB 
+                             / torch.maximum(ScaleConst.SQRT_UB,
+                                             torch.norm(inputs,p=1,dim=-1).max()))
+        scaled_inputs = inputs * self.scale_factor
+        ScaleLayer.var *= self.scale_factor
+        if ScaleLayer.var < ScaleConst.FP16_EPS:
+            ScaleLayer.var = ScaleConst.FP16_EPS
+        return scaled_inputs
+    
+    def _unscale(self, output):
+        if(ScaleConst.SOFTMAX_NAME in self.api_name):
+            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / ScaleLayer.var).detach()
+            ScaleLayer.var = 1.0
+            return unscaled_outputs
+        
\ No newline at end of file
-- 
Gitee


From 06c0ba77c42a0bff3d4ae8e7175e8449d2875a82 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 18:01:43 +0800
Subject: [PATCH 03/15] add free-benchmark auto-fix; delete scale

---
 .../docs/15.free_benchmarking_PyTorch.md      |   7 +-
 .../pytorch/free_benchmark/common/enums.py    |   8 +-
 .../msprobe/pytorch/free_benchmark/main.py    |   1 +
 .../perturbed_layers/layer_factory.py         |   2 -
 .../perturbed_layers/npu/scale.py             | 132 ------------------
 5 files changed, 8 insertions(+), 142 deletions(-)
 delete mode 100644 debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py

diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
index a2bc2112c16..af3b9c9fef2 100644
--- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
@@ -20,7 +20,7 @@
 2. **扰动因子**：基于torch.nn.Module的hook机制，在注册的hook函数中对算子输入进行特定类型扰动。
 3. **误差分析**：
     * **check**: 在hook函数中二次执行算子得到扰动后的算子输出，计算扰动后输出与原始输出的相对误差，查看是否符合精度标准；
-    * **fix**: 需要做验证时，可以选择将特定扰动类型（升精度，to cpu）的输出替换原始输出，观察对模型Loss是否有影响。
+    * **fix**: 需要做验证时，可以选择将特定扰动类型（升精度，to cpu）的输出替换原始输出，观察对模型Loss是否有影响；需要恢复算子时，可以选择自动恢复（auto_fix）：工具会检测前向中的 NaN/Inf/全0 问题，然后按照 缩放 -> 升精度 -> Synchronize -> Contiguous 的顺序依次尝试替换，仍无法恢复时提示改用 to_cpu 进一步排查。
 4. **精度风险算子**：不达标精度标准的，最终会在输出件中展示
 
 ![alt text](./img/free_benchmark_framework.png)
@@ -96,12 +96,13 @@ D-->config.json配置
 
 <table>
     <tr><th>参数</th><th>是否必选</th><th>可配置项</th><th>适用场景</th></tr>
-    <tr><td rowspan="6">pert_mode</td><td rowspan="6">否</td><td>"improve_precision" （默认）</td><td>（常用）(可做验证) 插桩算子可能在<b>低精度</b>下有精度问题，扰动因子会将输入的低精度向量升精度。</td></tr>
+    <tr><td rowspan="7">pert_mode</td><td rowspan="7">否</td><td>"improve_precision" （默认）</td><td>（常用）(可做验证) 插桩算子可能在<b>低精度</b>下有精度问题，扰动因子会将输入的低精度向量升精度。</td></tr>
     <tr><td>"bit_noise"</td><td>（常用）插桩算子可能在<b>轻微扰动</b>下暴露精度问题，扰动因子会将输入向量最后一个比特位翻转。</td></tr>
     <tr><td>"add_noise"</td><td>插桩算子可能在<b>轻微扰动</b>下暴露精度问题，扰动因子会为输入向量增加一个极小。</td></tr>
     <tr><td>"change_value"</td><td>插桩算子可能存在<b>大数吃小数</b>问题，扰动因子会交换输入向量的首尾。</td></tr>
     <tr><td>"no_change"</td><td>插桩算子可能存在<b>数值稳定性</b>精度问题，扰动因子会复制原始输。</td></tr>
     <tr><td>"to_cpu"</td><td>(可做验证) 插桩算子可能在<b>同 CPU </b>精度表现不一致，扰动因子会将输入转至 CPU，需要配合 fuzz_device="cpu"使用。</td></tr>
+    <tr><td>"auto_fix"</td><td>(专做修复) 已有怀疑算子时用于自动恢复：检测前向中的 NaN/Inf/全0 问题，按照 缩放 -> 升精度 -> Synchronize -> Contiguous 的顺序依次尝试替换，仍无法恢复时提示改用 "to_cpu" 进一步排查。</td></tr>
     <tr><td rowspan="2">fuzz_device</td><td rowspan="2">否</td><td>"npu" （默认）</td><td>pert_mode 不需要to cpu操作。</td></tr>
     <tr><td>"cpu"</td><td>pert_mode 须配置为"to_cpu"，目前仅支持"to cpu"扰动因子。</td></tr>
 </table>
@@ -111,7 +112,7 @@ D-->config.json配置
 <table>
     <tr><th>参数</th><th>是否必选</th><th>可配置项</th><th>适用场景</th></tr>
     <tr><td rowspan="2">handler_type</td><td rowspan="2">否</td><td>"check"（默认）</td><td>要做精度问题算子排查，输出扰动前后不符合精度标准的算子，支持所有扰动因子。</td></tr>
-    <tr><td>"fix"</td><td>要做可疑算子验证，用扰动后输出替换原始输出，支持"improve_precision"，"to_cpu"两种扰动因子。</td></tr>
+    <tr><td>"fix"</td><td>要做可疑算子验证，用扰动后输出替换原始输出，支持"improve_precision"，"to_cpu"两种扰动因子；要做快速修复，用扰动后输出替换原始输出，支持"auto_fix"。</td></tr>
 </table>
 
 ### 3.3 在模型脚本中开启工具
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
index cb1654683f9..930aeff21a1 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
@@ -8,8 +8,7 @@ class PerturbationMode:
     NO_CHANGE = "no_change"
     BIT_NOISE = "bit_noise"
     TO_CPU = "to_cpu"
-    AUTO = "auto_fix"
-    SCALE = "scale"
+    AUTO = "auto_fix"#!guo
 
 
 class DeviceType:
@@ -50,8 +49,7 @@ class PytorchFreeBenchmarkConst:
         PerturbationMode.NO_CHANGE,
         PerturbationMode.BIT_NOISE,
         PerturbationMode.TO_CPU,
-        PerturbationMode.AUTO,
-        PerturbationMode.SCALE
+        PerturbationMode.AUTO,#!guo
     ]
     DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION
     DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU]
@@ -61,7 +59,7 @@ class PytorchFreeBenchmarkConst:
     FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL]
     DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL
     FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD]
-    FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO, PerturbationMode.SCALE]
+    FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]#!guo
     DEFAULT_FUZZ_STAGE = Const.FORWARD
     DEFAULT_PREHEAT_STEP = 15
     DEFAULT_MAX_SAMPLE = 20
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
index 1d3d6fdfe37..439a4cdf5c8 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
@@ -88,6 +88,7 @@ class FreeBenchmarkCheck(ABC):
         layer.handle(data_params)
         handler_params = make_handler_params(name, self.config, self.current_iter)
         handler = FuzzHandlerFactory.create(handler_params)
+        #!guo
         if handler_params.pert_mode == PerturbationMode.AUTO:
             perturbed_output = handler.handle(data_params, handler_params.pert_mode)
             return perturbed_output, handler.get_unequal_rows()
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
index dcbd6739fc8..15005ecd394 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py
@@ -26,7 +26,6 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.improve_precision impor
 from msprobe.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer
 from msprobe.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer
 from msprobe.pytorch.free_benchmark.perturbed_layers.npu.auto_fix import AutoLayer
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.scale import ScaleLayer
 
 class LayerFactory:
     layers = {
@@ -37,7 +36,6 @@ class LayerFactory:
             PerturbationMode.BIT_NOISE: BitNoiseLayer,
             PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer,
             PerturbationMode.AUTO: AutoLayer,
-            PerturbationMode.SCALE: ScaleLayer,
         },
         DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer},
     }
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
deleted file mode 100644
index beaaa6c34bf..00000000000
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/scale.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#!guo @Rodin
-import torch
-import torch_npu
-
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-from msprobe.core.common.const import Const
-from msprobe.pytorch.free_benchmark import logger
-from msprobe.pytorch.free_benchmark.common.constant import CommonField
-from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
-from msprobe.pytorch.free_benchmark.common.params import DataParams
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
-    NpuBaseLayer,
-)
-
-class ScaleConst:
-    """
-    Class for ScaleLayer's const
-    """
-    SOFTMAX_NAME = "softmax"
-    LINEAR_NAME = "linear"
-    MATMUL_NAME = "matmul"
-
-    FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
-    FP16_UB = torch.finfo(torch.float16).max
-    
-    import numpy as np
-    SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
-    
-    COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
-
-
-class ScaleLayer(NpuBaseLayer):
-    var = 1.0
-    
-    def is_nonvalid(self, tensor_obj):
-        if isinstance(tensor_obj, torch.Tensor):
-            if not torch.isfinite(tensor_obj).all():
-                return True
-            return False
-        if isinstance(tensor_obj, dict):
-            return any(self.is_nonvalid(value) for value in tensor_obj.values())
-        if isinstance(tensor_obj, (tuple, list)):
-            return any(self.is_nonvalid(value) for value in tensor_obj)
-        return False
-    
-    def tensor_scale(self, tensor_obj,unscale=False):
-        if isinstance(tensor_obj, torch.Tensor):
-            if(unscale):
-                tensor_obj = self._unscale(tensor_obj)
-            else:
-                tensor_obj = self._scale(tensor_obj)
-                self.is_added = True
-            return tensor_obj
-        if isinstance(tensor_obj, dict):
-            return {
-                key: self.tensor_scale(value) 
-                for key, value in tensor_obj.items()
-            }
-        if isinstance(tensor_obj, (tuple, list)):
-            return type(tensor_obj)(
-                [self.tensor_scale(value) for value in tensor_obj]
-            )
-        return tensor_obj
-
-    def handle(self, params: DataParams):
-        logger.info_on_rank_0(
-            f"[msprobe] Free benchmark: Perturbation is "
-            f"{PerturbationMode.SCALE} of {self.api_name}.")
-
-        for x in ScaleConst.COMMUNICATION_NAMES:
-            if x in self.api_name:
-                params.perturbed_result=(params.original_result)/ScaleLayer.var
-                ScaleLayer.var = 1.0
-                return params.perturbed_result
-
-        if(self.is_nonvalid(params.args)):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
-            params.perturbed_result = params.original_result
-            return params.perturbed_result
-            
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            new_args = params.args
-        else:
-            new_args = self.tensor_scale(params.args)
-
-        params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
-
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)  
-            try:
-                new_args1 = params.perturbed_result,*new_args[1:]
-                params.perturbed_result = params.origin_func(*new_args1,
-                                                            **params.kwargs)
-            except KeyError as e:
-                logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
-
-        return params.perturbed_result
-    
-            
-    def _scale(self, inputs):
-        self.scale_factor = (ScaleConst.SQRT_UB 
-                             / torch.maximum(ScaleConst.SQRT_UB,
-                                             torch.norm(inputs,p=1,dim=-1).max()))
-        scaled_inputs = inputs * self.scale_factor
-        ScaleLayer.var *= self.scale_factor
-        if ScaleLayer.var < ScaleConst.FP16_EPS:
-            ScaleLayer.var = ScaleConst.FP16_EPS
-        return scaled_inputs
-    
-    def _unscale(self, output):
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / ScaleLayer.var).detach()
-            ScaleLayer.var = 1.0
-            return unscaled_outputs
-        
\ No newline at end of file
-- 
Gitee


From 66e6e9618b2a32665b5409144bfa5fb27fbafe6a Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 18:07:38 +0800
Subject: [PATCH 04/15] add free-benchmark auto-fix; delete scale

---
 .../msprobe/docs/15.free_benchmarking_PyTorch.md            | 3 ++-
 .../msprobe/pytorch/free_benchmark/common/enums.py          | 6 +++---
 .../msprobe/pytorch/free_benchmark/common/utils.py          | 2 +-
 debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py | 1 -
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
index af3b9c9fef2..6a590f4b093 100644
--- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
@@ -1,7 +1,7 @@
 # PyTorch 场景的无标杆比对
 
 ## 1 简介
-* 本工具的目标是在不依赖标杆数据的情况下，检测模型训练中可能存在的精度问题API级别算子，并提供升精度和tocpu接口快速验证。
+* 本工具的目标是在不依赖标杆数据的情况下，检测模型训练中可能存在的精度问题API级别算子，并提供升精度和tocpu接口快速验证，以及针对算子的快速恢复。
 * 工具基于**数值病态分析理论**：对算子的输入增加很小的扰动，从而放大输出值异常现象；检测算子原始输出和扰动后输出间误差是否符合精度标准。
 
 * 该工具的**特点**有：
@@ -10,6 +10,7 @@
 * 推荐使用场景（针对**算子精度问题**）：
     * **暂无标杆数据**，模型Loss异常，要做精度问题算子排查；
     * **验证可疑算子**，要做进一步确认，验证是否对模型Loss有影响；
+    * **可疑算子快速恢复**，使用scale、切精度、同步等方法快速排除和恢复算子问题；
     * 低精度模型效果不如高精度，要做精度问题算子排查。
 * 该工具的约束
     * 仅支持Pytorch2.x场景；
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
index 930aeff21a1..dac78f016a0 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/enums.py
@@ -8,7 +8,7 @@ class PerturbationMode:
     NO_CHANGE = "no_change"
     BIT_NOISE = "bit_noise"
     TO_CPU = "to_cpu"
-    AUTO = "auto_fix"#!guo
+    AUTO = "auto_fix"
 
 
 class DeviceType:
@@ -49,7 +49,7 @@ class PytorchFreeBenchmarkConst:
         PerturbationMode.NO_CHANGE,
         PerturbationMode.BIT_NOISE,
         PerturbationMode.TO_CPU,
-        PerturbationMode.AUTO,#!guo
+        PerturbationMode.AUTO,
     ]
     DEFAULT_MODE = PerturbationMode.IMPROVE_PRECISION
     DEVICE_LIST = [DeviceType.NPU, DeviceType.CPU]
@@ -59,7 +59,7 @@ class PytorchFreeBenchmarkConst:
     FUZZ_LEVEL_LIST = [FuzzLevel.BASE_LEVEL]
     DEFAULT_FUZZ_LEVEL = FuzzLevel.BASE_LEVEL
     FUZZ_STAGE_LIST = [Const.FORWARD, Const.BACKWARD]
-    FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]#!guo
+    FIX_MODE_LIST = [PerturbationMode.IMPROVE_PRECISION, PerturbationMode.TO_CPU, PerturbationMode.AUTO]
     DEFAULT_FUZZ_STAGE = Const.FORWARD
     DEFAULT_PREHEAT_STEP = 15
     DEFAULT_MAX_SAMPLE = 20
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index 4a46ac3a91f..51f3b143443 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -78,7 +78,7 @@ class Tools:
         if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
             if pert_mode == PerturbationMode.AUTO:
                 origin.data = perturbed.to(origin.device)
-                return origin #!guo
+                return origin 
             origin.data = perturbed.to(origin.dtype).to(origin.device)
             return origin
         if isinstance(origin, dict) and isinstance(perturbed, dict):
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
index 439a4cdf5c8..1d3d6fdfe37 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/main.py
@@ -88,7 +88,6 @@ class FreeBenchmarkCheck(ABC):
         layer.handle(data_params)
         handler_params = make_handler_params(name, self.config, self.current_iter)
         handler = FuzzHandlerFactory.create(handler_params)
-        #!guo
         if handler_params.pert_mode == PerturbationMode.AUTO:
             perturbed_output = handler.handle(data_params, handler_params.pert_mode)
             return perturbed_output, handler.get_unequal_rows()
-- 
Gitee


From f3cb1f0d5edae7c420132e7b8e6d16315c2760be Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Fri, 22 Nov 2024 18:12:20 +0800
Subject: [PATCH 05/15] auto-fix: skip healthy outputs, return after first successful fix; add tmp.py

---
 .../perturbed_layers/npu/auto_fix.py          |  62 +++--
 tmp.py                                        | 258 ++++++++++++++++++
 2 files changed, 303 insertions(+), 17 deletions(-)
 create mode 100644 tmp.py

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 810bd33eb7f..1807323caa6 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -113,11 +113,16 @@ class AutoLayer(NpuBaseLayer):
         return tensor_obj
     
     def handle(self, params: DataParams):
-        # Try Scale
+        if not self.check_catastrophe(params.original_result):
+            params.perturbed_result = params.original_result
+            return params.perturbed_result
+        
+        #! Try Scale
         logger.info_on_rank_0(
+            f"[msprobe] Free benchmark: An Problem shows here.\n"
             f"[msprobe] Free benchmark: Perturbation is "
             f"{PerturbationMode.AUTO} of {self.api_name}."
-            f"Trying Scale for this"
+            f"Trying Scale for this."
         )
         for x in ScaleConst.COMMUNICATION_NAMES:
             if x in self.api_name:
@@ -145,12 +150,19 @@ class AutoLayer(NpuBaseLayer):
             except KeyError as e:
                 logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+                
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
         
-        # Try Improve Precision
+        #! Try improve precision
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Auto Scaler is Useless "
-                f"Trying to improve precision for"
+                f"[msprobe] Free benchmark: 'Scaler' is Useless. "
+                f"Trying to improve precision for "
                 f"{PerturbationMode.AUTO} of {self.api_name}."
             )
             new_args = self.improve_tensor_precision(params.args)
@@ -158,41 +170,57 @@ class AutoLayer(NpuBaseLayer):
                 new_kwargs = {}
             else:
                 new_kwargs = self.improve_tensor_precision(params.kwargs)
-            # 如果输入中全为高精度、应跳过二次执行、减少多余显存引用
-            if not self.is_added:
-                return params.perturbed_result
             if "inplace" in new_kwargs:
                 new_kwargs["inplace"] = False
             params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
         
-        # Try Synchronize
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
+        
+        #! Try Synchronize
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Auto Scaler is Useless "
-                f"Trying Synchronize for"
+                f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
+                f"Trying Synchronize for "
                 f"{PerturbationMode.AUTO} of {self.api_name}."
             )
             torch_npu.npu.synchronize()
             params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
             torch_npu.npu.synchronize()
+            
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
         
-        # Try Contiguous
+        #! Try Contiguous
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Synchronize is Useless, too "
+                f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. "
                 f"Trying 'Contiguous' for"
                 f"{PerturbationMode.AUTO} of {self.api_name}."
             )
             new_args = self.tensor_contiguous(params.args)
             new_kwargs = self.tensor_contiguous(params.kwargs)
-            if not self.is_added:
-                return params.perturbed_result
             params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
         
-        # Hint to 'tocpu'
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
+        
+        #! Hint to 'tocpu'
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Contiguous is Useless, too "
+                f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
                 f"Please set pert_mode to 'tocpu' for further check."
             )
         return params.original_result
diff --git a/tmp.py b/tmp.py
new file mode 100644
index 00000000000..1102f8cfeb6
--- /dev/null
+++ b/tmp.py
@@ -0,0 +1,258 @@
+# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from msprobe.core.common.const import Const
+from msprobe.pytorch.free_benchmark import logger
+from msprobe.pytorch.free_benchmark.common.constant import CommonField
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
+from msprobe.pytorch.free_benchmark.common.params import DataParams
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
+    NpuBaseLayer,
+)
+
+class ScaleConst:
+    """
+    Class for ScaleLayer's const
+    """
+    SOFTMAX_NAME = "softmax"
+    LINEAR_NAME = "linear"
+    MATMUL_NAME = "matmul"
+
+    FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
+    FP16_UB = torch.finfo(torch.float16).max
+    
+    import numpy as np
+    SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
+    
+    COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
+    
+class AutoLayer(NpuBaseLayer):
+    scale_var = 1.0
+    
+    def check_catastrophe(self, tensor_obj):
+        if isinstance(tensor_obj, torch.Tensor):
+            if torch.all(tensor_obj.eq(0)):
+                return True
+            if torch.isinf(tensor_obj).any():
+                return True
+            if torch.isnan(tensor_obj).any():
+                return True
+            return False
+        if isinstance(tensor_obj, dict):
+            return any(self.check_catastrophe(value) for value in tensor_obj.values())
+        if isinstance(tensor_obj, (tuple, list)):
+            return any(self.check_catastrophe(value) for value in tensor_obj)
+        return False
+
+    def tensor_scale(self, tensor_obj,unscale=False):
+        if isinstance(tensor_obj, torch.Tensor):
+            if(unscale):
+                tensor_obj = self._unscale(tensor_obj)
+            else:
+                tensor_obj = self._scale(tensor_obj)
+                self.is_added = True
+            return tensor_obj
+        if isinstance(tensor_obj, dict):
+            return {
+                key: self.tensor_scale(value) 
+                for key, value in tensor_obj.items()
+            }
+        if isinstance(tensor_obj, (tuple, list)):
+            return type(tensor_obj)(
+                [self.tensor_scale(value) for value in tensor_obj]
+            )
+        return tensor_obj
+    
+    def tensor_contiguous(self, tensor_obj):
+        if isinstance(tensor_obj, torch.Tensor):
+            return tensor_obj.contiguous()
+        if isinstance(tensor_obj, dict):
+            return {
+                key: self.tensor_contiguous(value) 
+                for key, value in tensor_obj.items()
+            }
+        if isinstance(tensor_obj, (tuple, list)):
+            return type(tensor_obj)(
+                [self.tensor_contiguous(value) for value in tensor_obj]
+            )
+        return tensor_obj 
+    
+    def improve_tensor_precision(self, tensor_obj):
+        if (
+            isinstance(tensor_obj, torch.Tensor)
+            and torch.is_floating_point(tensor_obj)
+            and tensor_obj.dtype not in [torch.float32, torch.float64]
+        ):
+            self._set_improve_values(tensor_obj)
+            tensor_obj = self._change_dtype(tensor_obj)
+            self.is_added = True
+            return tensor_obj
+        if isinstance(tensor_obj, dict):
+            return {
+                key: self.improve_tensor_precision(value)
+                for key, value in tensor_obj.items()
+            }
+        if isinstance(tensor_obj, (tuple, list)):
+            return type(tensor_obj)(
+                [self.improve_tensor_precision(value) for value in tensor_obj]
+            )
+        return tensor_obj
+    
+    def handle(self, params: DataParams):
+        if not self.check_catastrophe(params.original_result):
+            params.perturbed_result = params.original_result
+            return params.perturbed_result
+        
+        #! Try Scale
+        logger.info_on_rank_0(
+            f"[msprobe] Free benchmark: An Problem shows here.\n"
+            f"[msprobe] Free benchmark: Perturbation is "
+            f"{PerturbationMode.AUTO} of {self.api_name}."
+            f"Trying Scale for this."
+        )
+        for x in ScaleConst.COMMUNICATION_NAMES:
+            if x in self.api_name:
+                params.perturbed_result=(params.original_result)/AutoLayer.scale_var
+                AutoLayer.scale_var = 1.0
+
+        if(self.check_catastrophe(params.args)):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
+            params.perturbed_result = params.original_result
+            
+        if(ScaleConst.SOFTMAX_NAME in self.api_name):
+            new_args = params.args
+        else:
+            new_args = self.tensor_scale(params.args)
+
+        params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
+
+        if(ScaleConst.SOFTMAX_NAME in self.api_name):
+            params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)  
+            try:
+                new_args1 = params.perturbed_result,*new_args[1:]
+                params.perturbed_result = params.origin_func(*new_args1,
+                                                            **params.kwargs)
+            except KeyError as e:
+                logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+                
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
+
+        #! Try improve precision
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: 'Scaler' is Useless. "
+                f"Trying to improve precision for "
+                f"{PerturbationMode.AUTO} of {self.api_name}."
+            )
+            new_args = self.improve_tensor_precision(params.args)
+            if params.fuzz_stage == Const.BACKWARD:
+                new_kwargs = {}
+            else:
+                new_kwargs = self.improve_tensor_precision(params.kwargs)
+            if "inplace" in new_kwargs:
+                new_kwargs["inplace"] = False
+            params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+        
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
+        
+        #! Try Synchronize
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
+                f"Trying Synchronize for "
+                f"{PerturbationMode.AUTO} of {self.api_name}."
+            )
+            torch_npu.npu.synchronize()
+            params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
+            torch_npu.npu.synchronize()
+            
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
+        
+        #! Try Contiguous
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. "
+                f"Trying 'Contiguous' for"
+                f"{PerturbationMode.AUTO} of {self.api_name}."
+            )
+            new_args = self.tensor_contiguous(params.args)
+            new_kwargs = self.tensor_contiguous(params.kwargs)
+            params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+        
+        if not self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, "
+                f"Problem solved."
+            )
+            return params.perturbed_result
+        
+        #! Hint to 'tocpu'
+        if self.check_catastrophe(params.perturbed_result):
+            logger.info_on_rank_0(
+                f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
+                f"Please set pert_mode to 'tocpu' for further check."
+            )
+        return params.original_result
+
+    def _scale(self, inputs):
+        self.scale_factor = (ScaleConst.SQRT_UB 
+                             / torch.maximum(ScaleConst.SQRT_UB,
+                                             torch.norm(inputs,p=1,dim=-1).max()))
+        scaled_inputs = inputs * self.scale_factor
+        AutoLayer.scale_var *= self.scale_factor
+        if AutoLayer.scale_var < ScaleConst.FP16_EPS:
+            AutoLayer.scale_var = ScaleConst.FP16_EPS
+        return scaled_inputs
+    
+    def _unscale(self, output):
+        if(ScaleConst.SOFTMAX_NAME in self.api_name):
+            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach()
+            AutoLayer.scale_var = 1.0
+            return unscaled_outputs
+    def _set_improve_values(self, inputs):
+        if inputs.dtype in [torch.float16, torch.bfloat16]:
+            self.perturbed_value = torch.float32
+
+    def _change_dtype(self, inputs):
+        if hasattr(inputs, CommonField.DEVICE):
+            device = inputs.device
+            if device is CommonField.META:
+                new_inputs = inputs.to(
+                    device=CommonField.META, dtype=self.perturbed_value
+                )
+            else:
+                new_inputs = inputs.to(dtype=self.perturbed_value).to(device)
+        else:
+            new_inputs = inputs.to(dtype=self.perturbed_value)
+        return new_inputs
\ No newline at end of file
-- 
Gitee


From 168c5abf2bbf6c7a280b7fcd40c3b66e81b521d5 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Mon, 25 Nov 2024 09:59:08 +0800
Subject: [PATCH 06/15] update doc 02: document auto_fix for pert_mode and handler_type

---
 debug/accuracy_tools/msprobe/docs/02.config_introduction.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index f9bcf3476a8..bd07f611059 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -106,8 +106,8 @@ PyTorch 与 MindSpore 动态图场景下，"level"须为"L1"；MindSpore 静态
     <tr><td>PyTorch 场景：指定某一类 API，对某一类的 API 进行无标杆比对。<br/><b>配置示例</b>："list": ["relu"]。</td></tr>
     <tr><td>MindSpore 场景：指定 API 名称，对列表中的 API 进行检测。<br/><b>配置示例</b>："list": ["mindspore.mint.div", "mindspore.ops.bmm", "mindspore.Tensor.__add__"]。</td></tr>
     <tr><td>fuzz_device</td><td>标杆设备，str 类型。可选参数：<br/>        "npu"：无标杆，通过添加扰动因子进行比对，默认值；<br/>        "cpu"：以 CPU 为标杆，pert_mode 须配置为"to_cpu"（仅 PyTorch 场景支持）。<br/><b>配置示例</b>："fuzz_device": "npu"。</td><td>否</td></tr>
-    <tr><td>pert_mode</td><td>无标杆扰动因子，str 类型。可选参数：<br/>  "improve_precision"：对输入做升精度，默认值；<br/>        "add_noise"：对输入增加噪声；<br/>        "no_change"：不加扰动直接二次执行；<br/>        "bit_noise"：输入的末位比特翻转，MindSpore 场景不支持 BF16 类型的向量；<br/>        "change_value"：输入的张量首尾值调换；<br/>        "to_cpu"：在 CPU 等价执行（仅 PyTorch 场景支持）。<br/><b>配置示例</b>："pert_mode": "improve_precision"。</td><td>否</td></tr>
-    <tr><td>handler_type</td><td>处理类型，可选参数：<br/> "check"：进行无标杆比对检查，默认值；<br/> "fix"：将扰动后的 API 输出结果覆盖原始 API 输出结果，尝试将 Loss 曲线恢复正常，该模式下不支持预热功能与反向过程，且仅支持"improve_precision"、"to_cpu"（ PyTorch 场景）两种扰动因子。<br/> <b>配置示例</b>："handler_type": "check"。</td><td>否</td></tr>
+    <tr><td>pert_mode</td><td>无标杆扰动因子，str 类型。可选参数：<br/>  "improve_precision"：对输入做升精度，默认值；<br/>        "add_noise"：对输入增加噪声；<br/>        "no_change"：不加扰动直接二次执行；<br/>        "bit_noise"：输入的末位比特翻转，MindSpore 场景不支持 BF16 类型的向量；<br/>        "change_value"：输入的张量首尾值调换；<br/>        "to_cpu"：在 CPU 等价执行（仅 PyTorch 场景支持）。<br/>        "auto_fix"：使用scale、切精度、同步等方法快速排除和恢复算子问题。<br/><b>配置示例</b>："pert_mode": "improve_precision"。</td><td>否</td></tr>
+    <tr><td>handler_type</td><td>处理类型，可选参数：<br/> "check"：进行无标杆比对检查，默认值；<br/> "fix"：将扰动后的 API 输出结果覆盖原始 API 输出结果，尝试将 Loss 曲线恢复正常，该模式下不支持预热功能与反向过程，且仅支持"improve_precision"、"to_cpu"（ PyTorch 场景）、"auto_fix"（ PyTorch 场景）三种扰动因子。<br/> <b>配置示例</b>："handler_type": "check"。</td><td>否</td></tr>
     <tr><td>fuzz_level</td><td>无标杆数据 dump 级别，即选择比对结果文件应输出的表头属性，当前仅支持取值为："L1"。输出结果详见 <a href="#161-无标杆比对数据存盘格式">1.6.1 无标杆比对数据存盘格式</a>。</td><td>否</td></tr>
     <tr><td>fuzz_stage</td><td>比对过程，选择对 API 前向或反向进行无标杆比对，可选参数：<br/> "forward"：前向，默认值；<br/>  "backward"：反向, 仅 PyTorch 场景支持。当 fuzz_stage 为 "backward" 时，handler_type 只能为 "check"。<br/>  <b>配置示例</b>："fuzz_stage": "backward"。</td><td>否</td></tr>
     <tr><td>if_preheat</td><td>预热功能（仅 PyTorch 场景支持），bool 类型。开启功能后工具可以根据每次迭代的输出调整精度算法的阈值，从而更准确地找出存在精度问题的 API。当"handler_type": "fix"时，不支持预热。可选参数：<br/>  true（开启）或 false（关闭），默认关闭。<br/>  <b>配置示例</b>："if_preheat": "true"。</td><td>否</td></tr>
-- 
Gitee


From a7465de08b6120ebc80fa2094c2d96b527fe9579 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Mon, 25 Nov 2024 13:05:17 +0800
Subject: [PATCH 07/15] update auto_fix-tocpu

---
 .../msprobe/pytorch/free_benchmark/common/utils.py     |  2 +-
 .../free_benchmark/perturbed_layers/npu/auto_fix.py    | 10 +++++++++-
 .../free_benchmark/result_handlers/fix_handler.py      |  6 +++---
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index 51f3b143443..ac00bd13f6e 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -77,7 +77,7 @@ class Tools:
     def convert_fuzz_output_to_origin(origin, perturbed, pert_mode):
         if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
             if pert_mode == PerturbationMode.AUTO:
-                origin.data = perturbed.to(origin.device)
+                origin.data = perturbed
                 return origin 
             origin.data = perturbed.to(origin.dtype).to(origin.device)
             return origin
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 1807323caa6..8f4ebda2141 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -23,6 +23,8 @@ from msprobe.pytorch.free_benchmark.common.params import DataParams
 from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
     NpuBaseLayer,
 )
+from msprobe.pytorch.free_benchmark.common.utils import Tools
+from msprobe.pytorch.free_benchmark.common.enums import DeviceType
 
 class ScaleConst:
     """
@@ -221,8 +223,14 @@ class AutoLayer(NpuBaseLayer):
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
-                f"Please set pert_mode to 'tocpu' for further check."
+                f"Trying 'To_cpu' for"
+                f"{PerturbationMode.AUTO} of {self.api_name}."
             )
+            new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True)
+            new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True)
+            params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
+            return params.perturbed_result
+
         return params.original_result
 
     def _scale(self, inputs):
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py
index d0b918402dd..b70b0b6bce6 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py
@@ -20,17 +20,17 @@ from msprobe.pytorch.free_benchmark import logger
 from msprobe.pytorch.free_benchmark.common.params import DataParams
 from msprobe.pytorch.free_benchmark.common.utils import Tools
 from msprobe.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler
-
+from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
 
 class FixHandler(FuzzHandler):
 
     def get_threshold(self, dtype):
         return self._get_default_threshold(dtype)
 
-    def handle(self, data_params: DataParams) -> Any:
+    def handle(self, data_params: DataParams, pert_mode: PerturbationMode = None) -> Any:
         try:
             return Tools.convert_fuzz_output_to_origin(
-                data_params.original_result, data_params.perturbed_result
+                data_params.original_result, data_params.perturbed_result, pert_mode
             )
         except FreeBenchmarkException as e:
             logger.warning(
-- 
Gitee


From f12cbeee8adebf5e559e85fc36b8ace61d431a27 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Mon, 25 Nov 2024 20:40:55 +0800
Subject: [PATCH 08/15] Update scale

---
 .../perturbed_layers/npu/auto_fix.py          | 117 +++++++++---------
 1 file changed, 56 insertions(+), 61 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 8f4ebda2141..3a7b367627a 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -25,6 +25,7 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import
 )
 from msprobe.pytorch.free_benchmark.common.utils import Tools
 from msprobe.pytorch.free_benchmark.common.enums import DeviceType
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 class ScaleConst:
     """
@@ -43,8 +44,6 @@ class ScaleConst:
     COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
     
 class AutoLayer(NpuBaseLayer):
-    scale_var = 1.0
-    
     def check_catastrophe(self, tensor_obj):
         if isinstance(tensor_obj, torch.Tensor):
             if torch.all(tensor_obj.eq(0)):
@@ -115,56 +114,50 @@ class AutoLayer(NpuBaseLayer):
         return tensor_obj
     
     def handle(self, params: DataParams):
-        if not self.check_catastrophe(params.original_result):
-            params.perturbed_result = params.original_result
+        self.scale_factor = 1.0
+        params.perturbed_result = params.original_result
+        if not self.check_catastrophe(params.perturbed_result):
             return params.perturbed_result
-        
-        #! Try Scale
         logger.info_on_rank_0(
-            f"[msprobe] Free benchmark: An Problem shows here.\n"
-            f"[msprobe] Free benchmark: Perturbation is "
-            f"{PerturbationMode.AUTO} of {self.api_name}."
-            f"Trying Scale for this."
-        )
-        for x in ScaleConst.COMMUNICATION_NAMES:
-            if x in self.api_name:
-                params.perturbed_result=(params.original_result)/AutoLayer.scale_var
-                AutoLayer.scale_var = 1.0
-
-        if(self.check_catastrophe(params.args)):
+                f"[msprobe] Free benchmark: An Problem shows here. "
+            )
+        #! Try Scale
+        if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME in self.api_name:
             logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
-            params.perturbed_result = params.original_result
-            
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            new_args = params.args
-        else:
+                f"[msprobe] Free benchmark: Perturbation is "
+                f"{PerturbationMode.AUTO} of {self.api_name}. "
+                f"Trying Scale for this."
+            )
             new_args = self.tensor_scale(params.args)
-
-        params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
-
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)  
-            try:
-                new_args1 = params.perturbed_result,*new_args[1:]
-                params.perturbed_result = params.origin_func(*new_args1,
-                                                            **params.kwargs)
-            except KeyError as e:
+            params.perturbed_result = params.origin_func(
+                *new_args, **params.kwargs)
+    
+            if (ScaleConst.SOFTMAX_NAME in self.api_name):
+                params.perturbed_result = self.tensor_scale(
+                    params.perturbed_result, unscale=True)
+                try:
+                    new_args1 = params.perturbed_result, *new_args[1:]
+                    params.perturbed_result = params.origin_func(*new_args1,
+                                                                **params.kwargs)
+                except KeyError as e:
+                    logger.info_on_rank_0(
+                        f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
+            else:
+                params.perturbed_result = self.tensor_scale(
+                    params.perturbed_result, unscale=True)
+        
+            if not self.check_catastrophe(params.perturbed_result):
                 logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
-                
-        if not self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
-                f"Problem solved."
-            )
-            return params.perturbed_result
+                    f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
+                    f"Problem solved."
+                )
+                return params.perturbed_result
         
         #! Try improve precision
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: 'Scaler' is Useless. "
-                f"Trying to improve precision for "
+                f"Trying 'improve precision' for "
                 f"{PerturbationMode.AUTO} of {self.api_name}."
             )
             new_args = self.improve_tensor_precision(params.args)
@@ -187,7 +180,7 @@ class AutoLayer(NpuBaseLayer):
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
-                f"Trying Synchronize for "
+                f"Trying 'Synchronize' for "
                 f"{PerturbationMode.AUTO} of {self.api_name}."
             )
             torch_npu.npu.synchronize()
@@ -223,31 +216,33 @@ class AutoLayer(NpuBaseLayer):
         if self.check_catastrophe(params.perturbed_result):
             logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
-                f"Trying 'To_cpu' for"
-                f"{PerturbationMode.AUTO} of {self.api_name}."
+                f"Please set pert_mode to 'To_cpu' for further check."
             )
-            new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True)
-            new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True)
-            params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
-            return params.perturbed_result
-
         return params.original_result
 
+    def _get_scale_factor(self, inputs):
+        upper = ScaleConst.SQRT_UB / 2
+        lower = torch.maximum(ScaleConst.SQRT_UB,
+                              torch.norm(inputs, p=1, dim=-1).max())
+        return upper / lower
+    
     def _scale(self, inputs):
-        self.scale_factor = (ScaleConst.SQRT_UB 
-                             / torch.maximum(ScaleConst.SQRT_UB,
-                                             torch.norm(inputs,p=1,dim=-1).max()))
-        scaled_inputs = inputs * self.scale_factor
-        AutoLayer.scale_var *= self.scale_factor
-        if AutoLayer.scale_var < ScaleConst.FP16_EPS:
-            AutoLayer.scale_var = ScaleConst.FP16_EPS
+        cur_scale = self._get_scale_factor(inputs)
+        self.scale_factor = max(ScaleConst.FP16_EPS,
+                                cur_scale * self.scale_factor)
+        scaled_inputs = inputs * cur_scale
         return scaled_inputs
     
     def _unscale(self, output):
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach()
-            AutoLayer.scale_var = 1.0
-            return unscaled_outputs
+        if (ScaleConst.SOFTMAX_NAME in self.api_name):
+            unscaled_outputs = (
+                torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
+        else:
+            rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
+                                                           torch.norm(output, p=1, dim=-1).max())
+            unscaled_outputs = rescale_coeff * output
+        return unscaled_outputs
+    
     def _set_improve_values(self, inputs):
         if inputs.dtype in [torch.float16, torch.bfloat16]:
             self.perturbed_value = torch.float32
-- 
Gitee


From faf46ff3d18d3920e521719662204c7ddcca8082 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 11:53:44 +0800
Subject: [PATCH 09/15] bmm adding

---
 .../pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 3a7b367627a..b5b4ff78e6e 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -34,6 +34,7 @@ class ScaleConst:
     SOFTMAX_NAME = "softmax"
     LINEAR_NAME = "linear"
     MATMUL_NAME = "matmul"
+    BMM_NAME = "bmm"
 
     FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
     FP16_UB = torch.finfo(torch.float16).max
@@ -122,7 +123,7 @@ class AutoLayer(NpuBaseLayer):
                 f"[msprobe] Free benchmark: An Problem shows here. "
             )
         #! Try Scale
-        if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME in self.api_name:
+        if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name:
             logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: Perturbation is "
                 f"{PerturbationMode.AUTO} of {self.api_name}. "
-- 
Gitee


From edfa2d8e0de72014042b53f8a12fb4fae9915bfe Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 12:11:22 +0800
Subject: [PATCH 10/15] bmm adding

---
 .../perturbed_layers/npu/auto_fix.py          | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index b5b4ff78e6e..73ea09c2abc 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -26,7 +26,7 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import
 from msprobe.pytorch.free_benchmark.common.utils import Tools
 from msprobe.pytorch.free_benchmark.common.enums import DeviceType
 from typing import Any, Callable, Dict, List, Optional, Tuple
-
+import numpy as np
 class ScaleConst:
     """
     Class for ScaleLayer's const
@@ -39,8 +39,8 @@ class ScaleConst:
     FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
     FP16_UB = torch.finfo(torch.float16).max
     
-    import numpy as np
-    SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
+
+    SQRT_UB = np.sqrt(FP16_UB)
     
     COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
     
@@ -222,10 +222,12 @@ class AutoLayer(NpuBaseLayer):
         return params.original_result
 
     def _get_scale_factor(self, inputs):
-        upper = ScaleConst.SQRT_UB / 2
-        lower = torch.maximum(ScaleConst.SQRT_UB,
-                              torch.norm(inputs, p=1, dim=-1).max())
-        return upper / lower
+        nominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
+        max_denominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
+        denominator = torch.maximum(max_denominator, torch.norm(inputs, p=1, dim=-1).max())
+        # if L1 norm of inputs is inf scale to 1 / sqrt(FP16_UB)
+        min_scale_factor = torch.tensor([1 / ScaleConst.SQRT_UB], dtype=torch.float16).npu()
+        return torch.maximum(nominator / denominator, min_scale_factor)
     
     def _scale(self, inputs):
         cur_scale = self._get_scale_factor(inputs)
@@ -236,13 +238,12 @@ class AutoLayer(NpuBaseLayer):
     
     def _unscale(self, output):
         if (ScaleConst.SOFTMAX_NAME in self.api_name):
-            unscaled_outputs = (
-                torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
+            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
+ 
         else:
-            rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
-                                                           torch.norm(output, p=1, dim=-1).max())
-            unscaled_outputs = rescale_coeff * output
-        return unscaled_outputs
+            rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
+            unscaled_outputs = output / rescale_factor
+            unscaled_outputs = torch.nan_to_num(unscaled_outputs, posinf=np.sqrt(ScaleConst.FP16_UB))
     
     def _set_improve_values(self, inputs):
         if inputs.dtype in [torch.float16, torch.bfloat16]:
-- 
Gitee


From 17579b085e5d743e607df9067a6e32bc99c441ef Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 14:32:48 +0800
Subject: [PATCH 11/15] update 4 OP

---
 .../perturbed_layers/npu/auto_fix.py          | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index 73ea09c2abc..a5c1cd6d163 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -40,7 +40,7 @@ class ScaleConst:
     FP16_UB = torch.finfo(torch.float16).max
     
 
-    SQRT_UB = np.sqrt(FP16_UB)
+    SQRT_UB = torch.tensor([np.sqrt(FP16_UB)], dtype=torch.float16).npu()
     
     COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
     
@@ -122,6 +122,7 @@ class AutoLayer(NpuBaseLayer):
         logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: An Problem shows here. "
             )
+        
         #! Try Scale
         if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name:
             logger.info_on_rank_0(
@@ -222,12 +223,10 @@ class AutoLayer(NpuBaseLayer):
         return params.original_result
 
     def _get_scale_factor(self, inputs):
-        nominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
-        max_denominator = torch.tensor([ScaleConst.SQRT_UB], dtype=torch.float16).npu()
-        denominator = torch.maximum(max_denominator, torch.norm(inputs, p=1, dim=-1).max())
-        # if L1 norm of inputs is inf scale to 1 / sqrt(FP16_UB)
-        min_scale_factor = torch.tensor([1 / ScaleConst.SQRT_UB], dtype=torch.float16).npu()
-        return torch.maximum(nominator / denominator, min_scale_factor)
+        upper = ScaleConst.SQRT_UB / 2
+        lower = torch.maximum(ScaleConst.SQRT_UB,
+                              torch.norm(inputs, p=1, dim=-1).max())
+        return upper / lower
     
     def _scale(self, inputs):
         cur_scale = self._get_scale_factor(inputs)
@@ -235,15 +234,16 @@ class AutoLayer(NpuBaseLayer):
                                 cur_scale * self.scale_factor)
         scaled_inputs = inputs * cur_scale
         return scaled_inputs
-    
+ 
     def _unscale(self, output):
         if (ScaleConst.SOFTMAX_NAME in self.api_name):
-            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
- 
+            unscaled_outputs = (
+                torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
         else:
-            rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
-            unscaled_outputs = output / rescale_factor
-            unscaled_outputs = torch.nan_to_num(unscaled_outputs, posinf=np.sqrt(ScaleConst.FP16_UB))
+            rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
+                                                           torch.norm(output, p=1, dim=-1).max())
+            unscaled_outputs = rescale_coeff * output
+        return unscaled_outputs
     
     def _set_improve_values(self, inputs):
         if inputs.dtype in [torch.float16, torch.bfloat16]:
-- 
Gitee


From 61b6f056e98cd89ad6ff988300811047c2d0b336 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 26 Nov 2024 14:38:52 +0800
Subject: [PATCH 12/15] update readme

---
 debug/accuracy_tools/msprobe/docs/02.config_introduction.md     | 2 +-
 .../accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index bd07f611059..a5d9e27062f 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -103,7 +103,7 @@ PyTorch 与 MindSpore 动态图场景下，"level"须为"L1"；MindSpore 静态
     <tr><th>参数</th><th>解释</th><th>是否必选</th></tr>
     <tr><td>scope</td><td>自定义检测 API 列表（仅 PyTorch 场景支持），list[str] 类型，默认值为空列表，当 list 也为空列表时，表示检测所有 API。需要在 [ ] 内配置具体 API 名（在 dump 的结果中查看）。与 list 参数不能同时配置。<br/><b>配置示例</b>："scope": ["Torch.matmul.0.forward", "Tensor.pow.4.forward"]。</td><td>否</td></tr>
     <tr><td rowspan="3">list</td><td>自定义检测 API 类型或 API 名称，list[str] 类型，默认值为空列表，表示检测所有 API（PyTorch 场景下还需 scope 也为空列表）。与 scope 参数不能同时配置。</td><td rowspan="3">否</td></tr>
-    <tr><td>PyTorch 场景：指定某一类 API，对某一类的 API 进行无标杆比对。<br/><b>配置示例</b>："list": ["relu"]。</td></tr>
+    <tr><td>PyTorch 场景：指定某一类 API，对某一类的 API 进行无标杆比对。<br/><b>配置示例</b>："list": ["relu"]。注意：当 pert_mode 配置为 "auto_fix" 时，其中的 scale 修复方法仅支持 matmul、bmm、softmax、linear 四类算子；其他算子将跳过 scale，使用其他修复方法。</td></tr>
     <tr><td>MindSpore 场景：指定 API 名称，对列表中的 API 进行检测。<br/><b>配置示例</b>："list": ["mindspore.mint.div", "mindspore.ops.bmm", "mindspore.Tensor.__add__"]。</td></tr>
     <tr><td>fuzz_device</td><td>标杆设备，str 类型。可选参数：<br/>        "npu"：无标杆，通过添加扰动因子进行比对，默认值；<br/>        "cpu"：以 CPU 为标杆，pert_mode 须配置为"to_cpu"（仅 PyTorch 场景支持）。<br/><b>配置示例</b>："fuzz_device": "npu"。</td><td>否</td></tr>
     <tr><td>pert_mode</td><td>无标杆扰动因子，str 类型。可选参数：<br/>  "improve_precision"：对输入做升精度，默认值；<br/>        "add_noise"：对输入增加噪声；<br/>        "no_change"：不加扰动直接二次执行；<br/>        "bit_noise"：输入的末位比特翻转，MindSpore 场景不支持 BF16 类型的向量；<br/>        "change_value"：输入的张量首尾值调换；<br/>        "to_cpu"：在 CPU 等价执行（仅 PyTorch 场景支持）。<br/>        "auto_fix"：使用scale、切精度、同步等方法快速排除和恢复算子问题。<br/><b>配置示例</b>："pert_mode": "improve_precision"。</td><td>否</td></tr>
diff --git a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
index 6a590f4b093..bd345813400 100644
--- a/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/15.free_benchmarking_PyTorch.md
@@ -88,7 +88,7 @@ D-->config.json配置
 <table>
     <tr><th>参数</th><th>是否必选</th><th>可配置项</th><th>适用场景</th></tr>
     <tr><td>scope</td><td>否</td><td>自定义</td><td>需要通过指定算子名来限制算子插桩范围 如：["Torch.matmul.0.forward", "Tensor.pow.4.forward"]。</td></tr>
-    <tr><td>list</td><td>否</td><td>自定义</td><td>需要通过指定算子类型来限制算子插桩范围 如：["relu"] 会匹配所有算子名中包含relu的算子。</td></tr>
+    <tr><td>list</td><td>否</td><td>自定义</td><td>需要通过指定算子类型来限制算子插桩范围 如：["relu"] 会匹配所有算子名中包含relu的算子。注意：当 pert_mode 配置为 "auto_fix" 时，其中的 scale 修复方法仅支持 matmul、bmm、softmax、linear 四类算子；其他算子将跳过 scale，使用其他修复方法。</td></tr>
     <tr><td rowspan="2">fuzz_stage</td><td rowspan="2">否</td><td>"forward"（默认）</td><td>需要进行算子<b>前向</b>计算的精度问题排查或<b>验证可疑算子。</b></td></tr>
     <tr><td>"backward"</td><td>需要进行算子<b>反向</b>计算的精度问题排查，不支持仅反向验证，前向验证包括反向。</td><td></td></tr>
 </table>
-- 
Gitee


From 2f8c3111a160ee6076c5c69e5fd1c9afa732bf1c Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Thu, 28 Nov 2024 10:44:25 +0800
Subject: [PATCH 13/15] backward scale fix support

---
 .../pytorch/free_benchmark/common/utils.py    |  2 +-
 .../perturbed_layers/npu/auto_fix.py          | 25 +++++++++++--------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
index ac00bd13f6e..3dbded07e94 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/common/utils.py
@@ -77,7 +77,7 @@ class Tools:
     def convert_fuzz_output_to_origin(origin, perturbed, pert_mode):
         if isinstance(origin, torch.Tensor) and isinstance(perturbed, torch.Tensor):
             if pert_mode == PerturbationMode.AUTO:
-                origin.data = perturbed
+                origin = perturbed
                 return origin 
             origin.data = perturbed.to(origin.dtype).to(origin.device)
             return origin
diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index a5c1cd6d163..a210e2aaf69 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -41,6 +41,7 @@ class ScaleConst:
     
 
     SQRT_UB = torch.tensor([np.sqrt(FP16_UB)], dtype=torch.float16).npu()
+    SQRT_UB_INV = torch.tensor([1 / np.sqrt(FP16_UB)], dtype=torch.float16).npu()
     
     COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
     
@@ -221,12 +222,17 @@ class AutoLayer(NpuBaseLayer):
                 f"Please set pert_mode to 'To_cpu' for further check."
             )
         return params.original_result
-
+   
     def _get_scale_factor(self, inputs):
-        upper = ScaleConst.SQRT_UB / 2
-        lower = torch.maximum(ScaleConst.SQRT_UB,
-                              torch.norm(inputs, p=1, dim=-1).max())
-        return upper / lower
+        nominator = ScaleConst.SQRT_UB
+        x_norm = torch.norm(inputs, p=1, dim=-1).max()
+        if(torch.isfinite(x_norm).all()):
+            denominator = torch.maximum(ScaleConst.SQRT_UB, x_norm)
+        else:
+            # L1 norm of inputs is not finite (inf/NaN): fall back to a scale of 1 / sqrt(FP16_UB)
+            return ScaleConst.SQRT_UB_INV.to(torch.get_device(inputs))
+        computed_scale = (nominator / denominator).to(torch.get_device(inputs))
+        return computed_scale
     
     def _scale(self, inputs):
         cur_scale = self._get_scale_factor(inputs)
@@ -237,12 +243,11 @@ class AutoLayer(NpuBaseLayer):
  
     def _unscale(self, output):
         if (ScaleConst.SOFTMAX_NAME in self.api_name):
-            unscaled_outputs = (
-                torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor).detach()
+            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
         else:
-            rescale_coeff = 0.5 * ScaleConst.FP16_UB / min(0.5*ScaleConst.FP16_UB,
-                                                           torch.norm(output, p=1, dim=-1).max())
-            unscaled_outputs = rescale_coeff * output
+            rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
+            unscaled_outputs = output / rescale_factor
+ 
         return unscaled_outputs
     
     def _set_improve_values(self, inputs):
-- 
Gitee


From 870867b0459c06d46e4e0f997a2715aa92e6df26 Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 24 Dec 2024 11:53:14 +0800
Subject: [PATCH 14/15] check-cla

---
 .../perturbed_layers/npu/auto_fix.py          | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
index a210e2aaf69..4941af49719 100644
--- a/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
+++ b/debug/accuracy_tools/msprobe/pytorch/free_benchmark/perturbed_layers/npu/auto_fix.py
@@ -15,18 +15,19 @@
 
 import torch
 import torch_npu
+import numpy as np
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import NpuBaseLayer
+from msprobe.pytorch.free_benchmark.common.utils import Tools
+from msprobe.pytorch.free_benchmark.common.enums import DeviceType
 from msprobe.core.common.const import Const
 from msprobe.pytorch.free_benchmark import logger
 from msprobe.pytorch.free_benchmark.common.constant import CommonField
 from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
 from msprobe.pytorch.free_benchmark.common.params import DataParams
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
-    NpuBaseLayer,
-)
-from msprobe.pytorch.free_benchmark.common.utils import Tools
-from msprobe.pytorch.free_benchmark.common.enums import DeviceType
-from typing import Any, Callable, Dict, List, Optional, Tuple
-import numpy as np
+
 class ScaleConst:
     """
     Class for ScaleLayer's const
@@ -116,6 +117,13 @@ class AutoLayer(NpuBaseLayer):
         return tensor_obj
     
     def handle(self, params: DataParams):
+        is_scale_applicable = (
+            ScaleConst.SOFTMAX_NAME in self.api_name or
+            ScaleConst.LINEAR_NAME in self.api_name or
+            ScaleConst.MATMUL_NAME in self.api_name or
+            ScaleConst.BMM_NAME in self.api_name
+        )
+        
         self.scale_factor = 1.0
         params.perturbed_result = params.original_result
         if not self.check_catastrophe(params.perturbed_result):
@@ -125,7 +133,7 @@ class AutoLayer(NpuBaseLayer):
             )
         
         #! Try Scale
-        if ScaleConst.SOFTMAX_NAME in self.api_name or ScaleConst.LINEAR_NAME in self.api_name or ScaleConst.MATMUL_NAME or ScaleConst.BMM_NAME in self.api_name:
+        if is_scale_applicable:
             logger.info_on_rank_0(
                 f"[msprobe] Free benchmark: Perturbation is "
                 f"{PerturbationMode.AUTO} of {self.api_name}. "
@@ -244,11 +252,11 @@ class AutoLayer(NpuBaseLayer):
     def _unscale(self, output):
         if (ScaleConst.SOFTMAX_NAME in self.api_name):
             unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / self.scale_factor)
+            return unscaled_outputs
         else:
             rescale_factor = max(self.scale_factor, (torch.max(output)) / ScaleConst.SQRT_UB)
             unscaled_outputs = output / rescale_factor
- 
-        return unscaled_outputs
+            return unscaled_outputs
     
     def _set_improve_values(self, inputs):
         if inputs.dtype in [torch.float16, torch.bfloat16]:
-- 
Gitee


From 67949d7af3ade77b1c0d4bfff508b9cd549cab3c Mon Sep 17 00:00:00 2001
From: pengyunke <14310393+pengyunke123@user.noreply.gitee.com>
Date: Tue, 24 Dec 2024 14:28:07 +0800
Subject: [PATCH 15/15] 12/24

---
 tmp.py | 258 ---------------------------------------------------------
 1 file changed, 258 deletions(-)
 delete mode 100644 tmp.py

diff --git a/tmp.py b/tmp.py
deleted file mode 100644
index 1102f8cfeb6..00000000000
--- a/tmp.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch_npu
-from msprobe.core.common.const import Const
-from msprobe.pytorch.free_benchmark import logger
-from msprobe.pytorch.free_benchmark.common.constant import CommonField
-from msprobe.pytorch.free_benchmark.common.enums import PerturbationMode
-from msprobe.pytorch.free_benchmark.common.params import DataParams
-from msprobe.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import (
-    NpuBaseLayer,
-)
-
-class ScaleConst:
-    """
-    Class for ScaleLayer's const
-    """
-    SOFTMAX_NAME = "softmax"
-    LINEAR_NAME = "linear"
-    MATMUL_NAME = "matmul"
-
-    FP16_EPS = torch.finfo(torch.float16).tiny # TODO dtype fix
-    FP16_UB = torch.finfo(torch.float16).max
-    
-    import numpy as np
-    SQRT_UB = torch.tensor([np.sqrt(FP16_UB)],dtype=torch.float16).npu()
-    
-    COMMUNICATION_NAMES = ["all_reduce","all_gather","reduce_scatter"]
-    
-class AutoLayer(NpuBaseLayer):
-    scale_var = 1.0
-    
-    def check_catastrophe(self, tensor_obj):
-        if isinstance(tensor_obj, torch.Tensor):
-            if torch.all(tensor_obj.eq(0)):
-                return True
-            if torch.isinf(tensor_obj).any():
-                return True
-            if torch.isnan(tensor_obj).any():
-                return True
-            return False
-        if isinstance(tensor_obj, dict):
-            return any(self.check_catastrophe(value) for value in tensor_obj.values())
-        if isinstance(tensor_obj, (tuple, list)):
-            return any(self.check_catastrophe(value) for value in tensor_obj)
-        return False
-
-    def tensor_scale(self, tensor_obj,unscale=False):
-        if isinstance(tensor_obj, torch.Tensor):
-            if(unscale):
-                tensor_obj = self._unscale(tensor_obj)
-            else:
-                tensor_obj = self._scale(tensor_obj)
-                self.is_added = True
-            return tensor_obj
-        if isinstance(tensor_obj, dict):
-            return {
-                key: self.tensor_scale(value) 
-                for key, value in tensor_obj.items()
-            }
-        if isinstance(tensor_obj, (tuple, list)):
-            return type(tensor_obj)(
-                [self.tensor_scale(value) for value in tensor_obj]
-            )
-        return tensor_obj
-    
-    def tensor_contiguous(self, tensor_obj):
-        if isinstance(tensor_obj, torch.Tensor):
-            return tensor_obj.contiguous()
-        if isinstance(tensor_obj, dict):
-            return {
-                key: self.tensor_contiguous(value) 
-                for key, value in tensor_obj.items()
-            }
-        if isinstance(tensor_obj, (tuple, list)):
-            return type(tensor_obj)(
-                [self.tensor_contiguous(value) for value in tensor_obj]
-            )
-        return tensor_obj 
-    
-    def improve_tensor_precision(self, tensor_obj):
-        if (
-            isinstance(tensor_obj, torch.Tensor)
-            and torch.is_floating_point(tensor_obj)
-            and tensor_obj.dtype not in [torch.float32, torch.float64]
-        ):
-            self._set_improve_values(tensor_obj)
-            tensor_obj = self._change_dtype(tensor_obj)
-            self.is_added = True
-            return tensor_obj
-        if isinstance(tensor_obj, dict):
-            return {
-                key: self.improve_tensor_precision(value)
-                for key, value in tensor_obj.items()
-            }
-        if isinstance(tensor_obj, (tuple, list)):
-            return type(tensor_obj)(
-                [self.improve_tensor_precision(value) for value in tensor_obj]
-            )
-        return tensor_obj
-    
-    def handle(self, params: DataParams):
-        if not self.check_catastrophe(params.original_result):
-            params.perturbed_result = params.original_result
-            return params.perturbed_result
-        
-        #! Try Scale
-        logger.info_on_rank_0(
-            f"[msprobe] Free benchmark: An Problem shows here.\n"
-            f"[msprobe] Free benchmark: Perturbation is "
-            f"{PerturbationMode.AUTO} of {self.api_name}."
-            f"Trying Scale for this."
-        )
-        for x in ScaleConst.COMMUNICATION_NAMES:
-            if x in self.api_name:
-                params.perturbed_result=(params.original_result)/AutoLayer.scale_var
-                AutoLayer.scale_var = 1.0
-
-        if(self.check_catastrophe(params.args)):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: INPUT is not valid - Scaling is useless!")
-            params.perturbed_result = params.original_result
-            
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            new_args = params.args
-        else:
-            new_args = self.tensor_scale(params.args)
-
-        params.perturbed_result = params.origin_func(*new_args, **params.kwargs)
-
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            params.perturbed_result = self.tensor_scale(params.perturbed_result,unscale=True)  
-            try:
-                new_args1 = params.perturbed_result,*new_args[1:]
-                params.perturbed_result = params.origin_func(*new_args1,
-                                                            **params.kwargs)
-            except KeyError as e:
-                logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Something was wrong during softmax recalc!!!")
-                
-        if not self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Autofix-'Scaler' is Useful, "
-                f"Problem solved."
-            )
-            return params.perturbed_result
-
-        #! Try improve precision
-        if self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: 'Scaler' is Useless. "
-                f"Trying to improve precision for "
-                f"{PerturbationMode.AUTO} of {self.api_name}."
-            )
-            new_args = self.improve_tensor_precision(params.args)
-            if params.fuzz_stage == Const.BACKWARD:
-                new_kwargs = {}
-            else:
-                new_kwargs = self.improve_tensor_precision(params.kwargs)
-            if "inplace" in new_kwargs:
-                new_kwargs["inplace"] = False
-            params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
-        
-        if not self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Autofix-'Improve Precision' is Useful, "
-                f"Problem solved."
-            )
-            return params.perturbed_result
-        
-        #! Try Synchronize
-        if self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: 'Improve Precision' is Useless "
-                f"Trying Synchronize for "
-                f"{PerturbationMode.AUTO} of {self.api_name}."
-            )
-            torch_npu.npu.synchronize()
-            params.perturbed_result = params.origin_func(*params.args, **params.kwargs)
-            torch_npu.npu.synchronize()
-            
-        if not self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Autofix-'Synchronize' is Useful, "
-                f"Problem solved."
-            )
-            return params.perturbed_result
-        
-        #! Try Contiguous
-        if self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: 'Synchronize' is Useless, too. "
-                f"Trying 'Contiguous' for"
-                f"{PerturbationMode.AUTO} of {self.api_name}."
-            )
-            new_args = self.tensor_contiguous(params.args)
-            new_kwargs = self.tensor_contiguous(params.kwargs)
-            params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
-        
-        if not self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: Autofix-'Contiguous' is Useful, "
-                f"Problem solved."
-            )
-            return params.perturbed_result
-        
-        #! Hint to 'tocpu'
-        if self.check_catastrophe(params.perturbed_result):
-            logger.info_on_rank_0(
-                f"[msprobe] Free benchmark: 'Contiguous' is Useless, too. "
-                f"Please set pert_mode to 'tocpu' for further check."
-            )
-        return params.original_result
-
-    def _scale(self, inputs):
-        self.scale_factor = (ScaleConst.SQRT_UB 
-                             / torch.maximum(ScaleConst.SQRT_UB,
-                                             torch.norm(inputs,p=1,dim=-1).max()))
-        scaled_inputs = inputs * self.scale_factor
-        AutoLayer.scale_var *= self.scale_factor
-        if AutoLayer.scale_var < ScaleConst.FP16_EPS:
-            AutoLayer.scale_var = ScaleConst.FP16_EPS
-        return scaled_inputs
-    
-    def _unscale(self, output):
-        if(ScaleConst.SOFTMAX_NAME in self.api_name):
-            unscaled_outputs = (torch.log(output + ScaleConst.FP16_EPS) / AutoLayer.scale_var).detach()
-            AutoLayer.scale_var = 1.0
-            return unscaled_outputs
-    def _set_improve_values(self, inputs):
-        if inputs.dtype in [torch.float16, torch.bfloat16]:
-            self.perturbed_value = torch.float32
-
-    def _change_dtype(self, inputs):
-        if hasattr(inputs, CommonField.DEVICE):
-            device = inputs.device
-            if device is CommonField.META:
-                new_inputs = inputs.to(
-                    device=CommonField.META, dtype=self.perturbed_value
-                )
-            else:
-                new_inputs = inputs.to(dtype=self.perturbed_value).to(device)
-        else:
-            new_inputs = inputs.to(dtype=self.perturbed_value)
-        return new_inputs
\ No newline at end of file
-- 
Gitee