From 1f6835739c28f9cabfc3323e73130ec1ccc32b2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AD=8F=E7=90=A2=E8=89=BA?=
Date: Wed, 16 Jul 2025 16:48:48 +0800
Subject: [PATCH] dataset script fix

---
 docs/mindformers/docs/source_en/feature/dataset.md    | 4 ++--
 docs/mindformers/docs/source_zh_cn/feature/dataset.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/mindformers/docs/source_en/feature/dataset.md b/docs/mindformers/docs/source_en/feature/dataset.md
index 4b252da969..55b0997ad8 100644
--- a/docs/mindformers/docs/source_en/feature/dataset.md
+++ b/docs/mindformers/docs/source_en/feature/dataset.md
@@ -99,7 +99,7 @@ The following example demonstrates how to convert the `wikitext-103` dataset int
 Take [LlamaTokenizerFast](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer_config.json) and the [vocab file](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer.json) in the [DeepSeek-V3 repository](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base) as an example. If the corresponding repository does not exist locally, the configuration file (tokenizer_config.json) and vocab file (tokenizer.json) need to be downloaded to a local path, assumed to be /path/to/huggingface/tokenizer. Execute the following command to preprocess the dataset:
 
 ```shell
-python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \
+python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \
  --input /path/data.json \
  --output-prefix /path/megatron_data \
  --tokenizer-type HuggingFaceTokenizer \
@@ -109,7 +109,7 @@ The following example demonstrates how to convert the `wikitext-103` dataset int
 Take the outer tokenizer class [Llama3Tokenizer](https://gitee.com/mindspore/mindformers/blob/master/research/llama3_1/llama3_1_tokenizer.py) as an example: make sure the **local** MindSpore Transformers repository contains 'research/llama3_1/llama3_1_tokenizer.py', and execute the following command to preprocess the dataset:
 
 ```shell
-python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \
+python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \
  --input /path/data.json \
  --output-prefix /path/megatron_data \
  --tokenizer-type AutoRegister \
diff --git a/docs/mindformers/docs/source_zh_cn/feature/dataset.md b/docs/mindformers/docs/source_zh_cn/feature/dataset.md
index 47dbe009d2..2b3133127e 100644
--- a/docs/mindformers/docs/source_zh_cn/feature/dataset.md
+++ b/docs/mindformers/docs/source_zh_cn/feature/dataset.md
@@ -96,7 +96,7 @@ MindSpore Transformers提供了数据预处理脚本[preprocess_indexed_dataset.
 以[Deepseek-V3仓库](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base)中的[LlamaTokenizerFast](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer_config.json)和[词表](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer.json)为例。如果本地不存在对应仓库,需要将配置文件(tokenizer_config.json)和词表文件(tokenizer.json)手动下载到本地目录,假设为/path/to/huggingface/tokenizer。执行如下命令处理数据集:
 
 ```shell
-python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \
+python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \
  --input /path/data.json \
  --output-prefix /path/megatron_data \
  --tokenizer-type HuggingFaceTokenizer \
@@ -106,7 +106,7 @@ MindSpore Transformers提供了数据预处理脚本[preprocess_indexed_dataset.
 以外部tokenizer类[Llama3Tokenizer](https://gitee.com/mindspore/mindformers/blob/master/research/llama3_1/llama3_1_tokenizer.py)为例,确保**本地**mindformers仓库下存在'research/llama3_1/llama3_1_tokenizer.py',执行如下命令处理数据集:
 
 ```shell
-python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \
+python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \
  --input /path/data.json \
  --output-prefix /path/megatron_data \
  --tokenizer-type AutoRegister \
-- 
Gitee