From 4215941c4134063a52543e87a8590109e9266b16 Mon Sep 17 00:00:00 2001 From: niujunhao Date: Mon, 14 Apr 2025 20:06:04 +0800 Subject: [PATCH] fix dataset is_dynamic doc --- docs/mindformers/docs/source_en/function/dataset.md | 6 +++--- docs/mindformers/docs/source_zh_cn/function/dataset.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/mindformers/docs/source_en/function/dataset.md b/docs/mindformers/docs/source_en/function/dataset.md index 8c057b218d..1fb0722f1d 100644 --- a/docs/mindformers/docs/source_en/function/dataset.md +++ b/docs/mindformers/docs/source_en/function/dataset.md @@ -379,7 +379,6 @@ train_dataset: &train_dataset shuffle: False split: "train" path: "llm-wizard/alpaca-gpt4-data" - is_dynamic: False packing: pack handler: - type: AlpacaInstructDataHandler @@ -387,11 +386,12 @@ train_dataset: &train_dataset seq_length: 4096 prompt_key: "conversations" output_columns: ["input_ids", "labels"] + is_dynamic: False - type: PackingHandler seq_length: 4096 output_columns: ["input_ids", "labels", "actual_seq_len"] adaptor_config: - compress_mask: False + compress_mask: False column_names: *input_columns ``` @@ -437,13 +437,13 @@ When packing is configured, the dataset returns an `actual_seq_len` column. For shuffle: True split: "train" # Subset name of the online dataset path: "llm-wizard/alpaca-gpt4-data" # Online dataset name - is_dynamic: True handler: - type: AlpacaInstructDataHandler tokenizer_name: llama2_7b seq_length: 4096 prompt_key: "conversations" output_columns: *input_columns + is_dynamic: True seed: 0 num_parallel_workers: 8 python_multiprocessing: False diff --git a/docs/mindformers/docs/source_zh_cn/function/dataset.md b/docs/mindformers/docs/source_zh_cn/function/dataset.md index fca326a4e3..ddd7fbe3c1 100644 --- a/docs/mindformers/docs/source_zh_cn/function/dataset.md +++ b/docs/mindformers/docs/source_zh_cn/function/dataset.md @@ -379,7 +379,6 @@ train_dataset: &train_dataset shuffle: False split: "train" path: "llm-wizard/alpaca-gpt4-data" - is_dynamic: False packing: pack handler: - type: AlpacaInstructDataHandler @@ -387,11 +386,12 @@ train_dataset: &train_dataset seq_length: 4096 prompt_key: "conversations" output_columns: ["input_ids", "labels"] + is_dynamic: False - type: PackingHandler seq_length: 4096 output_columns: ["input_ids", "labels", "actual_seq_len"] adaptor_config: - compress_mask: False + compress_mask: False column_names: *input_columns ``` @@ -437,13 +437,13 @@ train_dataset: &train_dataset shuffle: True split: "train" # 在线数据集子集名称 path: "llm-wizard/alpaca-gpt4-data" # 在线数据集名称 - is_dynamic: True handler: - type: AlpacaInstructDataHandler tokenizer_name: llama2_7b seq_length: 4096 prompt_key: "conversations" output_columns: *input_columns + is_dynamic: True seed: 0 num_parallel_workers: 8 python_multiprocessing: False -- Gitee