From d1011faabc6bfa13c8e4a860150587589f78ade1 Mon Sep 17 00:00:00 2001
From: chujinjin <chujinjin52@huawei.com>
Date: Tue, 24 Sep 2024 09:38:24 +0800
Subject: [PATCH] add stress detect doc

---
 .../model_train/debug/stress_detect.md        | 116 ++++++++++++++++++
 .../source_zh_cn/model_train/index.rst        |   1 +
 2 files changed, 117 insertions(+)
 create mode 100644 docs/mindspore/source_zh_cn/model_train/debug/stress_detect.md

diff --git a/docs/mindspore/source_zh_cn/model_train/debug/stress_detect.md b/docs/mindspore/source_zh_cn/model_train/debug/stress_detect.md
new file mode 100644
index 0000000000..eafa831447
--- /dev/null
+++ b/docs/mindspore/source_zh_cn/model_train/debug/stress_detect.md
@@ -0,0 +1,116 @@
+# 硬件精度在线检测
+
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/model_train/debug/stress_detect.md)
+
+## 概述
+
+在昇腾硬件中，存在硬件静默失败导致的精度问题，精度问题如果不能及时检出，造成的影响非常大。针对该类硬件精度问题，提供了一个API，用于检测硬件精度是否有异常。
+
+> 目前仅支持昇腾硬件。
+
+## 使用方法
+
+### 接口原型及基本使用方法
+
+```python
+mindspore.utils.stress_detect()
+```
+
+详见[接口说明文档](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore.utils.html)。
+
+### 输出说明
+
+返回值为int，含义如下：
+
+- 0：检测结果通过
+- 174003：压测前处理失败
+- 174004：压测不支持并发
+- 274001：驱动版本不适配
+- 274002：芯片不支持该功能
+- 374002：调压失败
+- 374003：压测用例执行失败
+- 574006：精度对比异常
+- 574007：电压恢复失败
+- 其他：参考[ACL错误码说明](https://www.hiascend.com/document/detail/zh/canncommercial/80RC22/developmentguide/maintenref/troubleshooting/atlaserrorcode_15_0234.html)
+
+### 约束说明
+
+1. 硬件精度在线检测的使用需要修改用户的模型训练脚本，建议在训练开始前、结束后、两个step之间调用，同时需要预留2G大小的显存供压测接口使用。
+2. 硬件精度在线检测在集群所有节点之间要同步并发执行，所有节点的执行时间偏差控制在1秒级（推荐），否则会存在慢节点导致NotifyWait超时问题。
+3. 硬件精度检测用例可能提前检测出芯片的Weak点，使问题芯片提前N个月被发现并送厂返修，因此返修率会提高。
+4. 假如没有其他方式获取恢复的CKPT点，建议回滚到上一次硬件精度检测时间点的CKPT。
+5. 建议保证一天（24小时）执行一次。
+6. 硬件精度检测用例建议一直运行，基于硬件失效的浴盆曲线，即使在盆底还是存在失效，只是失效率变低，周期变长。
+7. 需要根据调压的压测效果决策是否需要保留调压。当前离线AIC压测工具的检出率为70~80%。
+8. 硬件精度在线检测用例仅支持Atlas A2训练系列产品，不支持在同一个节点运行多个训练作业，不建议使用多线程运行在线精度检测用例。
+9. 在进行硬件精度检测的时候会触发SOC调压（偏移芯片额定电压），SOC调压后需要重新初始化ACG，而初始化ACG时需要调整频率（1850M到1300M），因此可以看到频率变化，甚至超频。
+10. BUS电压设备热复位不支持自动恢复，需要设备上下电才能恢复，建议做训练作业前环境检测。
+
+### 调用示例
+
+```python
+import numpy as np
+import mindspore as ms
+from mindspore import nn
+
+# Define Model
+class Network(nn.Cell):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.dense_relu_sequential = nn.SequentialCell(
+            nn.Dense(28*28, 512),
+            nn.ReLU(),
+            nn.Dense(512, 512),
+            nn.ReLU(),
+            nn.Dense(512, 10)
+        )
+
+    def construct(self, x):
+        x = self.flatten(x)
+        logits = self.dense_relu_sequential(x)
+        return logits
+
+model = Network()
+
+# Instantiate loss function and optimizer
+loss_fn = nn.CrossEntropyLoss()
+optimizer = nn.SGD(model.trainable_params(), 1e-2)
+
+# 1. Define forward function
+def forward_fn(data, label):
+    logits = model(data)
+    loss = loss_fn(logits, label)
+    return loss, logits
+
+# 2. Get gradient function
+grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
+
+# 3. Define function of one-step training
+def train_step(data, label):
+    (loss, _), grads = grad_fn(data, label)
+    optimizer(grads)
+    return loss
+
+def train(model, dataset):
+    model.set_train()
+    for data, label in dataset:
+        loss = train_step(data, label)
+    return loss
+
+# Sample dataset
+train_dataset = [(ms.Tensor(np.random.rand(64,1,28,28).astype(np.float32)), ms.Tensor(np.random.randint(10, size=(64,)).astype(np.int32))) for _ in range(100)]
+
+epochs = 10
+for t in range(epochs):
+    print(f"Epoch {t+1}\n-----------")
+    loss = train(model, train_dataset)
+    print(f"Epoch {t+1} Loss: {loss}")
+
+    # Call stress detection after each epoch
+    ret = ms.utils.stress_detect()
+    if ret == 0:
+        print(f"Epoch {t+1}: Stress detection passed")
+    else:
+        print(f"Epoch {t+1}: Stress detection failed with error code: {ret}")
+```
\ No newline at end of file
diff --git a/docs/mindspore/source_zh_cn/model_train/index.rst b/docs/mindspore/source_zh_cn/model_train/index.rst
index bd5b3255aa..dd8f386c64 100644
--- a/docs/mindspore/source_zh_cn/model_train/index.rst
+++ b/docs/mindspore/source_zh_cn/model_train/index.rst
@@ -93,6 +93,7 @@
    debug/overview
    debug/dump
    debug/rdr
+   debug/stress_detect
    debug/sdc
    debug/error_analysis
    debug/pynative
-- 
Gitee