From a1ba7e578ce4b34e80fadfc480e87e92b3ebf93b Mon Sep 17 00:00:00 2001 From: Chijunlong <2869897445@qq.com> Date: Tue, 28 Dec 2021 11:01:04 +0800 Subject: [PATCH] update mindtext.embeddings/index.rst and installation.rst add mindtext.dataset.test1.md --- .../mindtext.dataset.test1.md | 56 ++++++++++++++++++- .../source/apis/mindtext.embeddings/index.rst | 4 -- docs/source/user/installation.rst | 9 +-- 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/docs/source/apis/mindtext.dataset/mindtext.dataset.test1.md b/docs/source/apis/mindtext.dataset/mindtext.dataset.test1.md index 2cb259c..c886cd1 100644 --- a/docs/source/apis/mindtext.dataset/mindtext.dataset.test1.md +++ b/docs/source/apis/mindtext.dataset/mindtext.dataset.test1.md @@ -1 +1,55 @@ -# mindtext.dataset.test1 \ No newline at end of file +# mindtext.dataset.test1 + +Dataset的构建 += + +Example SST-2数据集Dataset构建 + + from mindtext.dataset.classification import SST2Dataset + + dataset = SST2Dataset(paths='./mindtext/dataset/SST-2', + tokenizer="./mindtext/pretrain/roberta-base", + max_length=128, + truncation_strategy=True, + batch_size=32) + + ds = dataset() + ds = dataset.from_cache( columns_list=['input_ids', 'attention_mask','label'], + test_columns_list=['input_ids', 'attention_mask'], + batch_size=32 + ) + + train_dataset = ds['train'] +mindtext.dataset.base_dataset.Dataset +-- +> class mindtext.dataset.base_dataset.Dataset( vocab (Vocabulary, Optional): Convert tokens to index,default None. +> name (str, Optional): Dataset name,default None. +> label_map (Dict[str, int], Optional): Dataset label map,default None.) 
+ +通过base_dataset中基类Dataset来构建文本分类、文本匹配和生成任务对应的数据集 + +>\_\_init\_\_(self, vocab: Vocabulary = None, name: str = None, + label_map: Dict[str, int] = None) + +参数 +> + vocab(Vocabulary): 词表,默认为None +> + name(str): 下游任务数据集Dataset名称,默认为None +> + label_map(Dict[str, int], Optional): Dataset标签映射,默认为None +> + +mindtext.dataset.base_dataset.CLSBaseDataset +-- + +文本分类Dataset的基类 + +Example +>class SST2Dataset(CLSBaseDataset): + +mindtext.dataset.base_dataset.PairCLSBaseDataset +- + +文本匹配Dataset基类 + +Example + +>class LCQMCDataset(PairCLSBaseDataset): \ No newline at end of file diff --git a/docs/source/apis/mindtext.embeddings/index.rst b/docs/source/apis/mindtext.embeddings/index.rst index 1af9f7c..87dc07d 100644 --- a/docs/source/apis/mindtext.embeddings/index.rst +++ b/docs/source/apis/mindtext.embeddings/index.rst @@ -9,11 +9,7 @@ mindtext.embeddings .. toctree:: :maxdepth: 1 - mindtext.embeddings.bert_embedding mindtext.embeddings.char_embedding mindtext.embeddings.embedding - mindtext.embeddings.luke_embedding - mindtext.embeddings.region_embedding - mindtext.embeddings.roberta_embedding mindtext.embeddings.static_embedding diff --git a/docs/source/user/installation.rst b/docs/source/user/installation.rst index 5a12c6e..663ffd0 100644 --- a/docs/source/user/installation.rst +++ b/docs/source/user/installation.rst @@ -19,10 +19,11 @@ mindtext 依赖如下包:: numpy==1.21.2 mindspore_gpu==1.3.0 -其中PyTorch的安装可能与操作系统及 CUDA 的版本相关,请参见 `PyTorch 官网 `_ 。 -在依赖包安装完成的情况,您可以在命令行执行如下指令完成安装 +其中 ``mindspore`` 的安装可能与操作系统及 ``CUDA`` 的版本相关,请参见 `MindSpore 官网 <https://www.mindspore.cn/install>`_ 。 +在依赖包安装完成的情况下,您可以在命令行执行如下指令完成 mindtext 的安装 .. code:: shell - >>> pip install mindtext - >>> python -m spacy download en + >>> git clone https://gitee.com/mindspore/mindtext.git + >>> cd mindtext + >>> python setup.py install -- Gitee