diff --git a/fastSum/Dataloader/Readme_en.md b/fastSum/Dataloader/Readme_en.md new file mode 100644 index 0000000000000000000000000000000000000000..163bb0360dc0da44534710e022554114e033a085 --- /dev/null +++ b/fastSum/Dataloader/Readme_en.md @@ -0,0 +1,6 @@ +# Tips + +1. Please install the latest fastNLP: pip install git+https://gitee.com/fastnlp/fastNLP@dev +2. Set the FASTNLP_CACHE_DIR system environment variable; datasets will be downloaded to that location. +3. See example.py for a simple usage example. + diff --git a/fastSum/Dataloader/Readme_zh.md b/fastSum/Dataloader/Readme_zh.md new file mode 100644 index 0000000000000000000000000000000000000000..88722a5ac97b4ade6f2fe69a8b8f7160dbb21077 --- /dev/null +++ b/fastSum/Dataloader/Readme_zh.md @@ -0,0 +1,6 @@ +# 使用提醒 + +1. 使用前请安装最新的fastNLP,安装方式:pip install git+https://gitee.com/fastnlp/fastNLP@dev +2. 在系统环境变量中指定FASTNLP_CACHE_DIR的位置,为数据集下载位置 +3. 使用方法可参照example.py + diff --git a/fastSum/Dataloader/__pycache__/summarizationLoader.cpython-37.pyc b/fastSum/Dataloader/__pycache__/summarizationLoader.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..558ec94158cfa67c94c03f44931251862ec6f9c4 Binary files /dev/null and b/fastSum/Dataloader/__pycache__/summarizationLoader.cpython-37.pyc differ diff --git a/fastSum/Dataloader/example.py b/fastSum/Dataloader/example.py new file mode 100644 index 0000000000000000000000000000000000000000..7c0d190713c614f81afac5f08ff05c09865b0511 --- /dev/null +++ b/fastSum/Dataloader/example.py @@ -0,0 +1,11 @@ +from fastNLP.io.file_utils import get_cache_path +from summarizationLoader import ArxivLoader + +if __name__ == '__main__': + + # 请设置fastNLP默认cache的存放路径FASTNLP_CACHE_DIR, get_cache_path会获取设置下载的数据位置 + # 详细可参考: https://gitee.com/fastnlp/fastNLP/blob/7b4e099c5267efb6a4a88b9d789a0940be05bb56/fastNLP/io/file_utils.py#L228 + print(f'下载的数据位置: {get_cache_path()}') + ArxivLoader().download() + data = ArxivLoader().load() + print(data) \ No newline at end of file diff
--git a/fastSum/Dataloader/summarizationLoader.py b/fastSum/Dataloader/summarizationLoader.py index 65d69763b52b08a74bab3cc6642a93e7ddcd74cf..c8133995d425b28b815f4e15b829e5889e42e370 100644 --- a/fastSum/Dataloader/summarizationLoader.py +++ b/fastSum/Dataloader/summarizationLoader.py @@ -8,7 +8,6 @@ from fastNLP.io.data_bundle import DataBundle from fastNLP.core.const import Const from fastNLP.io.file_utils import get_cache_path, _get_dataset_url, cached_path - DATASET_DIR = { # Summarization 'ami': "AMI.zip", @@ -47,7 +46,9 @@ class SumLoader(JsonLoader): def download(self): default_cache_path = get_cache_path() url = _get_dataset_url(self.DATASET_NAME, DATASET_DIR) - output_dir = cached_path(url_or_filename=url, cache_dir=default_cache_path, name='dataset') + output_dir = cached_path(url_or_filename=url, + cache_dir=default_cache_path, + name='dataset') # https://gitee.com/fastnlp/fastNLP/blob/7b4e099c5267efb6a4a88b9d789a0940be05bb56/fastNLP/io/file_utils.py#L201 # 如果只有一个文件, get_filepath 返回 filepath + filename # os.path.dirname 反向处理 @@ -76,9 +77,12 @@ class CNNDMLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'CNNDM.train.label.jsonl')): - raise FileNotFoundError(f"CNNDM.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'CNNDM.train.label.jsonl') + if not os.path.isfile( + os.path.join(paths, 'CNNDM.train.label.jsonl')): + raise FileNotFoundError( + f"CNNDM.train.label.jsonl is not found in {paths}") + _paths['train'] = os.path.join(paths, + 'CNNDM.train.label.jsonl') _paths['dev'] = os.path.join(paths, 'CNNDM.valid.label.jsonl') _paths['test'] = os.path.join(paths, 'CNNDM.test.label.jsonl') paths = _paths @@ -110,9 +114,12 @@ class ArxivLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'arxiv.train.label.jsonl')): - raise FileNotFoundError(f"arxiv.train.label.jsonl is not found in {paths}") - _paths['train'] 
= os.path.join(paths, 'arxiv.train.label.jsonl') + if not os.path.isfile( + os.path.join(paths, 'arxiv.train.label.jsonl')): + raise FileNotFoundError( + f"arxiv.train.label.jsonl is not found in {paths}") + _paths['train'] = os.path.join(paths, + 'arxiv.train.label.jsonl') _paths['dev'] = os.path.join(paths, 'arxiv.valid.label.jsonl') _paths['test'] = os.path.join(paths, 'arxiv.test.label.jsonl') paths = _paths @@ -144,11 +151,17 @@ class BillSumLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'billsum_us.train.label.jsonl')): - raise FileNotFoundError(f"billsum_us.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'billsum_us.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'billsum_ca.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'billsum_us.test.label.jsonl') + if not os.path.isfile( + os.path.join(paths, 'billsum_us.train.label.jsonl')): + raise FileNotFoundError( + f"billsum_us.train.label.jsonl is not found in {paths}" + ) + _paths['train'] = os.path.join(paths, + 'billsum_us.train.label.jsonl') + _paths['dev'] = os.path.join(paths, + 'billsum_ca.valid.label.jsonl') + _paths['test'] = os.path.join(paths, + 'billsum_us.test.label.jsonl') paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -178,11 +191,16 @@ class MultiNewsLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'multinews.train.label.jsonl')): - raise FileNotFoundError(f"multinews.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'multinews.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'multinews.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'multinews.test.label.jsonl') + if not os.path.isfile( + os.path.join(paths, 'multinews.train.label.jsonl')): + raise FileNotFoundError( + f"multinews.train.label.jsonl is not found in {paths}") + 
_paths['train'] = os.path.join(paths, + 'multinews.train.label.jsonl') + _paths['dev'] = os.path.join(paths, + 'multinews.valid.label.jsonl') + _paths['test'] = os.path.join(paths, + 'multinews.test.label.jsonl') paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -212,9 +230,12 @@ class PubmedLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'pubmed.train.label.jsonl')): - raise FileNotFoundError(f"pubmed.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'pubmed.train.label.jsonl') + if not os.path.isfile( + os.path.join(paths, 'pubmed.train.label.jsonl')): + raise FileNotFoundError( + f"pubmed.train.label.jsonl is not found in {paths}") + _paths['train'] = os.path.join(paths, + 'pubmed.train.label.jsonl') _paths['dev'] = os.path.join(paths, 'pubmed.valid.label.jsonl') _paths['test'] = os.path.join(paths, 'pubmed.test.label.jsonl') paths = _paths @@ -246,9 +267,12 @@ class SAMSumLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'SAMSum.train.label.jsonl')): - raise FileNotFoundError(f"SAMSum.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'SAMSum.train.label.jsonl') + if not os.path.isfile( + os.path.join(paths, 'SAMSum.train.label.jsonl')): + raise FileNotFoundError( + f"SAMSum.train.label.jsonl is not found in {paths}") + _paths['train'] = os.path.join(paths, + 'SAMSum.train.label.jsonl') _paths['dev'] = os.path.join(paths, 'SAMSum.valid.label.jsonl') _paths['test'] = os.path.join(paths, 'SAMSum.test.label.jsonl') paths = _paths @@ -280,11 +304,15 @@ class WikiHowLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'wikihow.train.label.jsonl')): - raise FileNotFoundError(f"wikihow.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 
'wikihow.train.label.jsonl') + if not os.path.isfile( + os.path.join(paths, 'wikihow.train.label.jsonl')): + raise FileNotFoundError( + f"wikihow.train.label.jsonl is not found in {paths}") + _paths['train'] = os.path.join(paths, + 'wikihow.train.label.jsonl') _paths['dev'] = os.path.join(paths, 'wikihow.val.label.jsonl') - _paths['test'] = os.path.join(paths, 'wikihow.test.label.jsonl') + _paths['test'] = os.path.join(paths, + 'wikihow.test.label.jsonl') paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -314,8 +342,10 @@ class XsumLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'xsum.train.label.jsonl')): - raise FileNotFoundError(f"xsum.train.label.jsonl is not found in {paths}") + if not os.path.isfile( + os.path.join(paths, 'xsum.train.label.jsonl')): + raise FileNotFoundError( + f"xsum.train.label.jsonl is not found in {paths}") _paths['train'] = os.path.join(paths, 'xsum.train.label.jsonl') _paths['dev'] = os.path.join(paths, 'xsum.valid.label.jsonl') _paths['test'] = os.path.join(paths, 'xsum.test.label.jsonl') @@ -342,7 +372,8 @@ class RedditTIFULoader(SumLoader): super(RedditTIFULoader, self).__init__() self.valid_ratio = valid_ratio self.test_ratio = test_ratio - assert tag in ["long", "short"], "tag not valid (neither long nor short)!" + assert tag in ["long", + "short"], "tag not valid (neither long nor short)!" 
self.tag = tag def load(self, paths: Optional[Path] = None) -> DataBundle: @@ -352,14 +383,25 @@ class RedditTIFULoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, f"tifu_{self.tag}.all.label.jsonl")): - raise FileNotFoundError(f"tifu_{self.tag}.all.label.jsonl is not found in {paths}") - - _split_set(f"tifu_{self.tag}.all.label", paths, split_name1="middev", split_name2="train", + if not os.path.isfile( + os.path.join(paths, + f"tifu_{self.tag}.all.label.jsonl")): + raise FileNotFoundError( + f"tifu_{self.tag}.all.label.jsonl is not found in {paths}" + ) + + _split_set(f"tifu_{self.tag}.all.label", + paths, + split_name1="middev", + split_name2="train", ratio=self.valid_ratio + self.test_ratio) if self.valid_ratio + self.test_ratio > 0: - _split_set('middev', paths, split_name1="test", split_name2="dev", - ratio=self.test_ratio / (self.valid_ratio + self.test_ratio)) + _split_set('middev', + paths, + split_name1="test", + split_name2="dev", + ratio=self.test_ratio / + (self.valid_ratio + self.test_ratio)) _paths['train'] = os.path.join(paths, 'train.jsonl') if self.valid_ratio > 0: _paths['dev'] = os.path.join(paths, 'dev.jsonl') @@ -403,13 +445,21 @@ class AMILoader(SumLoader): if paths: if os.path.isdir(paths): if not os.path.isfile(os.path.join(paths, 'AMI.jsonl')): - raise FileNotFoundError(f"AMI.jsonl is not found in {paths}") + raise FileNotFoundError( + f"AMI.jsonl is not found in {paths}") - _split_set('AMI', paths, split_name1="middev", split_name2="train", + _split_set('AMI', + paths, + split_name1="middev", + split_name2="train", ratio=self.valid_ratio + self.test_ratio) if self.valid_ratio + self.test_ratio > 0: - _split_set('middev', paths, split_name1="test", split_name2="dev", - ratio=self.test_ratio / (self.valid_ratio + self.test_ratio)) + _split_set('middev', + paths, + split_name1="test", + split_name2="dev", + ratio=self.test_ratio / + (self.valid_ratio + self.test_ratio)) 
_paths['train'] = os.path.join(paths, 'train.jsonl') if self.valid_ratio > 0: _paths['dev'] = os.path.join(paths, 'dev.jsonl') @@ -452,13 +502,21 @@ class ICSILoader(SumLoader): if paths: if os.path.isdir(paths): if not os.path.isfile(os.path.join(paths, 'ICSI.jsonl')): - raise FileNotFoundError(f"ICSI.jsonl is not found in {paths}") + raise FileNotFoundError( + f"ICSI.jsonl is not found in {paths}") - _split_set('ICSI', paths, split_name1="middev", split_name2="train", + _split_set('ICSI', + paths, + split_name1="middev", + split_name2="train", ratio=self.valid_ratio + self.test_ratio) if self.valid_ratio + self.test_ratio > 0: - _split_set('middev', paths, split_name1="test", split_name2="dev", - ratio=self.test_ratio / (self.valid_ratio + self.test_ratio)) + _split_set('middev', + paths, + split_name1="test", + split_name2="dev", + ratio=self.test_ratio / + (self.valid_ratio + self.test_ratio)) _paths['train'] = os.path.join(paths, 'train.jsonl') if self.valid_ratio > 0: _paths['dev'] = os.path.join(paths, 'dev.jsonl') @@ -473,7 +531,13 @@ class ICSILoader(SumLoader): return data_bundle -def _split_set(dataset_name, data_dir, split_name1="dev", split_name2="train", ratio=0.0, suffix='jsonl', keep_orig: bool = True): +def _split_set(dataset_name, + data_dir, + split_name1="dev", + split_name2="train", + ratio=0.0, + suffix='jsonl', + keep_orig: bool = True): if ratio == 0: os.renames(os.path.join(data_dir, f'{dataset_name}.{suffix}'), os.path.join(data_dir, f'{split_name2}.{suffix}')) @@ -494,11 +558,13 @@ def _split_set(dataset_name, data_dir, split_name1="dev", split_name2="train", r if keep_orig: assert split_name1 != dataset_name and split_name2 != dataset_name else: - os.remove(os.path.join(data_dir, f'{dataset_name}.{suffix}')) + os.remove( + os.path.join(data_dir, f'{dataset_name}.{suffix}')) os.renames(os.path.join(data_dir, f'middle_file.{suffix}'), os.path.join(data_dir, f'{split_name2}.{suffix}')) finally: - if os.path.exists(os.path.join(data_dir, 
f'middle_file.{suffix}')): + if os.path.exists( + os.path.join(data_dir, f'middle_file.{suffix}')): os.remove(os.path.join(data_dir, f'middle_file.{suffix}')) return data_dir