From 7c98e4badcef437dc9511b4a4dc79faa5e764c78 Mon Sep 17 00:00:00 2001 From: buxue Date: Wed, 23 Apr 2025 16:35:50 +0800 Subject: [PATCH] add TRE for MS_ENABLE_TFT --- docs/mindformers/docs/source_en/function/high_availability.md | 1 + docs/mindformers/docs/source_zh_cn/function/high_availability.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/mindformers/docs/source_en/function/high_availability.md b/docs/mindformers/docs/source_en/function/high_availability.md index 72b5cc64b4..934bb13861 100644 --- a/docs/mindformers/docs/source_en/function/high_availability.md +++ b/docs/mindformers/docs/source_en/function/high_availability.md @@ -40,6 +40,7 @@ export MS_TFT_PORT=30051 - **TTP (Try To Persist)**: End-of-life CKPT function - **UCE (Uncorrectable Memory Error)**: UCE fault tolerance recovery - **ARF (Air Refuelling)**: Process-level rescheduling recovery function + - **TRE (Train Result Error)**: TRE fault tolerance recovery - When UCE or ARF is enabled, TTP is enabled by default. - `MS_TFT_IP` and `MS_TFT_PORT` represent the IP and port number of TFT Controller respectively, no default value, need to be specified by user. If the Controller is started by MindSpore Transformers, the IP and port number of the rank0 node in the user's cluster are configured. If the Controller is started by the user, configure the IP and port number of the Controller. 
diff --git a/docs/mindformers/docs/source_zh_cn/function/high_availability.md b/docs/mindformers/docs/source_zh_cn/function/high_availability.md index bf6f64b70a..93b359a5f4 100644 --- a/docs/mindformers/docs/source_zh_cn/function/high_availability.md +++ b/docs/mindformers/docs/source_zh_cn/function/high_availability.md @@ -40,6 +40,7 @@ export MS_TFT_PORT=30051 - **TTP (Try To Persist)**:临终 CKPT 功能 - **UCE (Uncorrectable Memory Error)**:UCE 故障容错恢复功能 - **ARF (Air Refuelling)**:进程级重调度恢复功能 + - **TRE (Train Result Error)**:TRE 故障容错恢复功能 - 开启 UCE 或者 ARF 功能时,默认开启 TTP 功能 - `MS_TFT_IP` 和 `MS_TFT_PORT` 分别表示 TFT Controller 的 IP 和端口号,无默认值,需要用户指定。如果由 MindSpore Transformers 启动 Controller,则配置用户集群中 rank0 节点的 IP 和端口号。如果用户自行启动 Controller,则配置 Controller 的 IP 和端口号。 -- Gitee