From fc6f7e036658b33b79c57ac1b7becf1769a7471f Mon Sep 17 00:00:00 2001
From: huan <3174348550@qq.com>
Date: Fri, 18 Jul 2025 17:15:46 +0800
Subject: [PATCH] fix typos and improve wording in tutorial docs

---
 .../source_en/beginner/accelerate_with_static_graph.md | 2 +-
 tutorials/source_en/dataset/cache.md | 8 ++++----
 tutorials/source_en/dataset/eager.md | 2 +-
 tutorials/source_en/dataset/optimize.ipynb | 6 +++---
 tutorials/source_en/dataset/python_objects.md | 6 +++---
 tutorials/source_en/dataset/record.ipynb | 4 ++--
 tutorials/source_en/dataset/sampler.md | 4 ++--
 tutorials/source_zh_cn/dataset/eager.ipynb | 2 +-
 tutorials/source_zh_cn/dataset/python_objects.ipynb | 2 +-
 tutorials/source_zh_cn/dataset/sampler.ipynb | 2 +-
 10 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/tutorials/source_en/beginner/accelerate_with_static_graph.md b/tutorials/source_en/beginner/accelerate_with_static_graph.md
index 94162ff4c5..3315bb8992 100644
--- a/tutorials/source_en/beginner/accelerate_with_static_graph.md
+++ b/tutorials/source_en/beginner/accelerate_with_static_graph.md
@@ -12,7 +12,7 @@ The AI compilation framework is divided into two modes of operation: dynamic gra
 Dynamic graphs are characterized by the construction of the computational graph and computation occurring at the same time (Define by run), which is in line with Python interpreted execution. When defining a Tensor in the computational graph, its value is computed and determined, so it is more convenient to debug the model, and can get the value of the intermediate results in real time, but it is difficult to optimize the whole computational graph because the fact that all the nodes need to be saved.
 
-In MindSpore, dynamic graph mode is also known as PyNative mode. Due to the interpreted execution of dynamic graphs, it is recommended to use dynamic graph mode for debugging during script development and network process debugging.
+In MindSpore, dynamic graph mode is also known as PyNative mode. Due to the interpreted execution of dynamic graphs, it is recommended to use dynamic graph mode during script development and network process debugging.
 
 If you need to manually control the framework to use PyNative mode, you can configure it with the following code:
 
 ```python
diff --git a/tutorials/source_en/dataset/cache.md b/tutorials/source_en/dataset/cache.md
index 01c05337cc..9ed6d7c3ce 100644
--- a/tutorials/source_en/dataset/cache.md
+++ b/tutorials/source_en/dataset/cache.md
@@ -199,7 +199,7 @@ The directory structure of the extracted dataset file is as follows:
 
 #### Caching the Original Dataset Data
 
-Cache the original dataset, and the datat is loaded by the MindSpore system.
+Cache the original dataset, and the data is loaded by the MindSpore system.
 
 ```python
 dataset_dir = "./datasets/cifar-10-batches-bin/train"
@@ -356,7 +356,7 @@ During the single-node multi-device distributed training, the cache operation al
 
 3. Pass the cache session id to the training script.
 
-    Continue to write the Shell script and add the following command to pass `session_id` and other parameters when the Python training is started:
+    Continue writing the Shell script and add the following command to pass `session_id` and other parameters when the Python training is started:
 
     ```bash
     # make the session_id available to the python scripts
@@ -458,7 +458,7 @@ However, access to NFS datasets is often expensive, resulting in longer training
 
 In order to improve the training performance of the NFS dataset, we can choose to use a cache service to cache the dataset in memory as Tensor. Once cached, post-sequence epochs can read data directly from memory, avoiding the overhead of accessing remote NAS.
 
-It should be noted that in the data processing process of the training process, and the dataset usually needs to be **augmentated** with randomness after being **read**, such as `RandomCropDecodeResize`. If the cache is added to the operation with randomness, it will cause the results of the first enhancement operation to be cached, and the results read from the cache server in the later sequence are the first cached data, resulting in the loss of data randomness and affecting the accuracy of the training network.
+It should be noted that, during data processing in the training process, the dataset usually needs to be **augmented** with randomness after being **read**, such as `RandomCropDecodeResize`. If the cache is added after an operation with randomness, the results of the first augmentation operation will be cached, and later epochs will read that first cached data from the cache server, resulting in the loss of data randomness and affecting the accuracy of the training network.
 
 Therefore, we can choose to add a cache directly after the data set **reads** the operation. This section takes this approach, using the MobileNetV2 network as a sample for an example.
 
@@ -594,7 +594,7 @@ For complete sample code, refer to ModelZoo's [MobileNetV2](https://gitee.com/mi
 
 ## Cache Performance Tuning
 
-The cache service performance can be **significantly improved** in following scenarios:
+The cache service performance can be **significantly improved** in the following scenarios:
 
 - Cache the data processed by augmentation, especially when the data processing pipeline contains high complexity operations such as decode. In this scenario, you do not need to perform the data augmentation operation repeatedly on each epoch, which saves a lot of time.
 - Use cache services during simple network training and inference. Compared with complex networks, simple networks require less training time. Therefore, the time performance is significantly improved when cache services are used in this scenario.
diff --git a/tutorials/source_en/dataset/eager.md b/tutorials/source_en/dataset/eager.md
index cb401dfc64..9d371698c6 100644
--- a/tutorials/source_en/dataset/eager.md
+++ b/tutorials/source_en/dataset/eager.md
@@ -353,7 +353,7 @@ import mindspore.dataset.audio as audio
 
 ds.config.set_seed(5)
 
-# cication: LibriSpeech http://www.openslr.org/12
+# citation: LibriSpeech http://www.openslr.org/12
 url = "https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/notebook/datasets/84-121123-0000.wav"
 download(url, './84-121123-0000.wav', replace=True)
 wav_file = "84-121123-0000.wav"
diff --git a/tutorials/source_en/dataset/optimize.ipynb b/tutorials/source_en/dataset/optimize.ipynb
index cad626fe5a..124eaa5c04 100644
--- a/tutorials/source_en/dataset/optimize.ipynb
+++ b/tutorials/source_en/dataset/optimize.ipynb
@@ -302,8 +302,8 @@
    "source": [
     "## Optimizing the Shuffle Performance\n",
     "\n",
-    "The shuffle operation is used to shuffle ordered datasets or repeated datasets. MindSpore provides the `shuffle` function for users which is based by memory cache. A larger value of `buffer_size` indicates a higher shuffling degree, consuming more computing resources and more time. This API allows users to shuffle the data at any time during the entire pipeline process. For the detailed contents, refer to [shuffle processing](https://www.mindspore.cn/docs/en/master/api_python/dataset/dataset_method/operation/mindspore.dataset.Dataset.shuffle.html#mindspore-dataset-dataset-shuffle).\n",
-    "Because it's based by memory cache, the performance of this method is not as good as that of setting the parameter `shuffle=True`(default: True) of dataset interface to shuffle data directly. For details, see [Built-in Dataset Loading Interfaces](https://www.mindspore.cn/docs/en/master/api_python/mindspore.dataset.html).\n",
+    "The shuffle operation is used to shuffle ordered datasets or repeated datasets. MindSpore provides the `shuffle` function for users, which is based on memory cache. A larger value of `buffer_size` indicates a higher shuffling degree, consuming more computing resources and more time. This API allows users to shuffle the data at any time during the entire pipeline process. For the detailed contents, refer to [shuffle processing](https://www.mindspore.cn/docs/en/master/api_python/dataset/dataset_method/operation/mindspore.dataset.Dataset.shuffle.html#mindspore-dataset-dataset-shuffle).\n",
+    "Because it's based on memory cache, the performance of this method is not as good as that of setting the parameter `shuffle=True` (default: True) of the dataset interface to shuffle data directly. For details, see [Built-in Dataset Loading Interfaces](https://www.mindspore.cn/docs/en/master/api_python/mindspore.dataset.html).\n",
     "\n",
     "Shuffle optimization suggestion:"
    ]
   },
@@ -697,7 +697,7 @@
     "When using MindSpore for standalone or distributed training, the setting of the parameter `num_parallel_workers` should follow the following principles:\n",
     "\n",
     "- The summary of the parameter `num_parallel_workers` set for each data loading and processing operation should not be greater than the maximum number of CPU cores of the machine, otherwise it will cause resource competition between each operation.\n",
-    "- Before setting the num_parallel_workers parameter, it is recommended to use MindSpore's Profiler (performance analysis) tool to analyze the performance of each operation in the training, and allocate more resources to the operation with pool performance, that is, set a large num_parallel_workers to balance the throughput between various operations and avoid unnecessary waiting.\n",
+    "- Before setting the num_parallel_workers parameter, it is recommended to use MindSpore's Profiler (performance analysis) tool to analyze the performance of each operation in the training, and allocate more resources to the operation with poor performance, that is, set a large num_parallel_workers to balance the throughput between various operations and avoid unnecessary waiting.\n",
     "- In a standalone training scenario, increasing the num_parallel_workers parameter can often directly improve processing performance, but in a distributed scenario, due to increased CPU competition, blindly increasing num_parallel_workers may lead to performance degradation. You need to try to use a compromise value."
    ]
   },
diff --git a/tutorials/source_en/dataset/python_objects.md b/tutorials/source_en/dataset/python_objects.md
index ece66510c2..4647b500b2 100644
--- a/tutorials/source_en/dataset/python_objects.md
+++ b/tutorials/source_en/dataset/python_objects.md
@@ -89,7 +89,7 @@ When `batch` operation is invoked on a dataset with a column containing dictiona
 
 The result of the `batch` operation (for that column) will be one dictionary where all values are NumPy arrays. If such conversion results in an array of type `np.object_`, due to limitations on the model training side, an error message will be shown to the user and the Dataset pipeline terminates.
 
-The following is a example demonstrating when dictionary object exists in dataset pipeline, how it batches the data of "power" key.
+The following is an example demonstrating how the data of the "power" key is batched when a dictionary object exists in the dataset pipeline.
 
 ```python
 import numpy as np
@@ -172,7 +172,7 @@ Output:
 
 Directly iterating through the dataset object can obtain dictionary type data. When using an iterator to retrieve data, the data processing pipeline will attempt to convert all values inside `dict` objects to Tensor type (if `output_numpy` is set to `True`, it will be converted to NumPy arrays).
 
-Note that this step will be applied recursively to all values inside nested dictionaries as well as all elements inside lists and tuples. For those types that cannot be converted to Tensor/NumPy arrays (such as class objects), they will be passed directly to model. If model can not recognize these types, error will be raised.
+Note that this step will be applied recursively to all values inside nested dictionaries as well as all elements inside lists and tuples. For those types that cannot be converted to Tensor/NumPy arrays (such as class objects), they will be passed directly to the model. If the model cannot recognize these types, an error will be raised.
 
 Here is an example shows how to acquire `dict` data from pipeline.
 
@@ -236,7 +236,7 @@ In the model training/inference scenario, there are the following constraints wh
     def dict_to_tuple(d):
         return tuple([i for i in d.values()])
 
-    # flatten the dict object bedfore it passed into network
+    # flatten the dict object before it is passed into the network
     data = data.map(dict_to_tuple, input_columns=['col1'], output_columns=['my_data', 'my_data2'])
 
     print('>>> get data in sequence type')
diff --git a/tutorials/source_en/dataset/record.ipynb b/tutorials/source_en/dataset/record.ipynb
index 54e94f419e..12047260fc 100644
--- a/tutorials/source_en/dataset/record.ipynb
+++ b/tutorials/source_en/dataset/record.ipynb
@@ -5,7 +5,7 @@
    "id": "f6392a05",
    "metadata": {},
    "source": [
-    "# MindRecord Format Cnversion\n",
+    "# MindRecord Format Conversion\n",
     "\n",
     "[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/dataset/record.ipynb)\n"
    ]
@@ -67,7 +67,7 @@
    "source": [
     "## Converting Dataset to Record Format\n",
     "\n",
-    "The following mainly describes how to convert CV class data and NLP class data to MindSpore Record file format, and read MindSpore Record file through the `MindDataset` interface.\n",
+    "The following mainly describes how to convert CV class data and NLP class data to the MindSpore Record file format and how to read MindSpore Record files through the `MindDataset` interface.\n",
     "\n",
     "### Converting CV Class Data\n",
     "\n",
diff --git a/tutorials/source_en/dataset/sampler.md b/tutorials/source_en/dataset/sampler.md
index 6b37d6a306..d1e6bf40b7 100644
--- a/tutorials/source_en/dataset/sampler.md
+++ b/tutorials/source_en/dataset/sampler.md
@@ -49,7 +49,7 @@ for data in dataset:
 
 docs/mindspore/source_en/features/dataset/overview.mdses where random access are expensive or forbidden.
 
-For example, when access a dataset with `iter(dataset)`, it should return a stream of data from a database or a remote server.
+For example, when accessing a dataset with `iter(dataset)`, it should return a stream of data from a database or a remote server.
 
 The following constructs a simple iterator and loads it into `GeneratorDataset`.
 
@@ -164,7 +164,7 @@ path = download(url, "./", kind="zip", replace=True)
 train_dataset = MnistDataset("MNIST_Data/train", shuffle=False)
 print(type(train_dataset))
 
-# visialize dataset content
+# visualize dataset content
 figure = plt.figure(figsize=(4, 4))
 cols, rows = 3, 3
 
diff --git a/tutorials/source_zh_cn/dataset/eager.ipynb b/tutorials/source_zh_cn/dataset/eager.ipynb
index 79f3d78adb..49a060a0a8 100644
--- a/tutorials/source_zh_cn/dataset/eager.ipynb
+++ b/tutorials/source_zh_cn/dataset/eager.ipynb
@@ -542,7 +542,7 @@
     "\n",
     "ds.config.set_seed(5)\n",
     "\n",
-    "# cication: LibriSpeech http://www.openslr.org/12\n",
+    "# citation: LibriSpeech http://www.openslr.org/12\n",
     "url = \"https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/notebook/datasets/84-121123-0000.wav\"\n",
     "download(url, './84-121123-0000.wav', replace=True)\n",
     "wav_file = \"84-121123-0000.wav\"\n",
diff --git a/tutorials/source_zh_cn/dataset/python_objects.ipynb b/tutorials/source_zh_cn/dataset/python_objects.ipynb
index 8d8e5e0fc9..d016d361ec 100644
--- a/tutorials/source_zh_cn/dataset/python_objects.ipynb
+++ b/tutorials/source_zh_cn/dataset/python_objects.ipynb
@@ -339,7 +339,7 @@
     "def dict_to_tuple(d):\n",
     "    return tuple([i for i in d.values()])\n",
     "\n",
-    "# flatten the dict object bedfore it passed into network\n",
+    "# flatten the dict object before it is passed into the network\n",
     "data = data.map(dict_to_tuple, input_columns=['col1'], output_columns=['my_data', 'my_data2'])\n",
     "\n",
     "print('>>> get data in sequence type')\n",
diff --git a/tutorials/source_zh_cn/dataset/sampler.ipynb b/tutorials/source_zh_cn/dataset/sampler.ipynb
index d122138b18..acc276d314 100644
--- a/tutorials/source_zh_cn/dataset/sampler.ipynb
+++ b/tutorials/source_zh_cn/dataset/sampler.ipynb
@@ -224,7 +224,7 @@
     "train_dataset = MnistDataset(\"MNIST_Data/train\", shuffle=False)\n",
     "print(type(train_dataset))\n",
     "\n",
-    "# visialize dataset content\n",
+    "# visualize dataset content\n",
     "figure = plt.figure(figsize=(4, 4))\n",
     "cols, rows = 3, 3\n",
     "\n",
-- 
Gitee
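
The python_objects hunks above only show the `dict_to_tuple` comment and the `map` call in isolation. Below is a minimal, self-contained sketch of how that dict-flattening step sits in a pipeline; the generator, the `col1` column layout, and the iterator call are illustrative assumptions rather than content of the patched tutorials, while the `map` call itself mirrors the line shown in the hunks.

```python
# Illustrative sketch, not part of the patch: flattening a dict column into
# plain columns before the data reaches the network, as discussed in
# python_objects.md. The generator and column names below are assumptions.
import numpy as np
import mindspore.dataset as ds


def my_generator():
    # each row carries a single column ("col1") whose value is a Python dict
    for i in range(3):
        yield ({"my_data": np.array([i, i + 1]), "my_data2": np.array(i * 10)},)


def dict_to_tuple(d):
    # flatten the dict object before it is passed into the network
    return tuple([i for i in d.values()])


data = ds.GeneratorDataset(my_generator, column_names=["col1"])
data = data.map(dict_to_tuple, input_columns=["col1"], output_columns=["my_data", "my_data2"])

for item in data.create_tuple_iterator(output_numpy=True):
    print(item)
```

After the flattening step, downstream operations such as `batch` and the training loop receive ordinary array columns instead of `dict` objects, which matches the constraints the tutorial describes for the model training/inference scenario.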