diff --git a/fastSum/Dataloader/Readme_en.md b/fastSum/Dataloader/Readme_en.md new file mode 100644 index 0000000000000000000000000000000000000000..163bb0360dc0da44534710e022554114e033a085 --- /dev/null +++ b/fastSum/Dataloader/Readme_en.md @@ -0,0 +1,6 @@ +# Tips + +1. Please install the latest fastNLP: pip install git+https://gitee.com/fastnlp/fastNLP@dev +2. Set the FASTNLP_CACHE_DIR environment variable; datasets are downloaded to that directory. +3. See example.py for a simple usage example. + diff --git a/fastSum/Dataloader/Readme_zh.md b/fastSum/Dataloader/Readme_zh.md new file mode 100644 index 0000000000000000000000000000000000000000..88722a5ac97b4ade6f2fe69a8b8f7160dbb21077 --- /dev/null +++ b/fastSum/Dataloader/Readme_zh.md @@ -0,0 +1,6 @@ +# 使用提醒 + +1. 使用前请安装最新的fastNLP,安装方式:pip install git+https://gitee.com/fastnlp/fastNLP@dev +2. 在系统环境变量中指定FASTNLP_CACHE_DIR的位置,为数据集下载位置 +3. 使用方法可参照example.py + diff --git a/fastSum/Dataloader/example.py b/fastSum/Dataloader/example.py new file mode 100644 index 0000000000000000000000000000000000000000..675aa93e9b84729136f668d69bc68c7fe242f543 --- /dev/null +++ b/fastSum/Dataloader/example.py @@ -0,0 +1,11 @@ +from fastNLP.io.file_utils import get_cache_path +from summarizationLoader import ArxivLoader + +if __name__ == "__main__": + + # 请设置fastNLP默认cache的存放路径FASTNLP_CACHE_DIR, get_cache_path会获取设置下载的数据位置 + # 详细可参考: https://gitee.com/fastnlp/fastNLP/blob/7b4e099c5267efb6a4a88b9d789a0940be05bb56/fastNLP/io/file_utils.py#L228 + print(f"下载的数据位置: {get_cache_path()}") + ArxivLoader().download() + data = ArxivLoader().load() + print(data) \ No newline at end of file diff --git a/fastSum/Dataloader/summarizationLoader.py b/fastSum/Dataloader/summarizationLoader.py index 65d69763b52b08a74bab3cc6642a93e7ddcd74cf..fcfc9db4c62a61ce71bb6fdf1a734593e2c36064 100644 --- a/fastSum/Dataloader/summarizationLoader.py +++ b/fastSum/Dataloader/summarizationLoader.py @@ -8,10 +8,9 @@ from fastNLP.io.data_bundle import DataBundle from 
fastNLP.core.const import Const from fastNLP.io.file_utils import get_cache_path, _get_dataset_url, cached_path - DATASET_DIR = { # Summarization - 'ami': "AMI.zip", + "ami": "AMI.zip", "arxiv": "Arxiv.zip", "billsum": "BillSum.zip", "cnndm": "CNNDM.zip", @@ -21,7 +20,7 @@ DATASET_DIR = { "reddit tifu": "Reddit TIFU.zip", "samsum": "SAMSum.zip", "wikihow": "WikiHow.zip", - "xsum": "Xsum.zip" + "xsum": "Xsum.zip", } @@ -34,11 +33,7 @@ class SumLoader(JsonLoader): def __init__(self, fields: Optional[Dict[str, str]] = None): if fields is None: - fields = { - 'text': 'text', - 'summary': 'summary', - 'label': Const.TARGET - } + fields = {"text": "text", "summary": "summary", "label": Const.TARGET} super(SumLoader, self).__init__(fields=fields) def load(self, paths: Optional[Path] = None) -> DataBundle: @@ -47,7 +42,9 @@ class SumLoader(JsonLoader): def download(self): default_cache_path = get_cache_path() url = _get_dataset_url(self.DATASET_NAME, DATASET_DIR) - output_dir = cached_path(url_or_filename=url, cache_dir=default_cache_path, name='dataset') + output_dir = cached_path( + url_or_filename=url, cache_dir=default_cache_path, name="dataset" + ) # https://gitee.com/fastnlp/fastNLP/blob/7b4e099c5267efb6a4a88b9d789a0940be05bb56/fastNLP/io/file_utils.py#L201 # 如果只有一个文件, get_filepath 返回 filepath + filename # os.path.dirname 反向处理 @@ -76,11 +73,13 @@ class CNNDMLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'CNNDM.train.label.jsonl')): - raise FileNotFoundError(f"CNNDM.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'CNNDM.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'CNNDM.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'CNNDM.test.label.jsonl') + if not os.path.isfile(os.path.join(paths, "CNNDM.train.label.jsonl")): + raise FileNotFoundError( + f"CNNDM.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, 
"CNNDM.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "CNNDM.valid.label.jsonl") + _paths["test"] = os.path.join(paths, "CNNDM.test.label.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -110,11 +109,13 @@ class ArxivLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'arxiv.train.label.jsonl')): - raise FileNotFoundError(f"arxiv.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'arxiv.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'arxiv.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'arxiv.test.label.jsonl') + if not os.path.isfile(os.path.join(paths, "arxiv.train.label.jsonl")): + raise FileNotFoundError( + f"arxiv.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, "arxiv.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "arxiv.valid.label.jsonl") + _paths["test"] = os.path.join(paths, "arxiv.test.label.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -144,11 +145,15 @@ class BillSumLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'billsum_us.train.label.jsonl')): - raise FileNotFoundError(f"billsum_us.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'billsum_us.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'billsum_ca.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'billsum_us.test.label.jsonl') + if not os.path.isfile( + os.path.join(paths, "billsum_us.train.label.jsonl") + ): + raise FileNotFoundError( + f"billsum_us.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, "billsum_us.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "billsum_ca.valid.label.jsonl") + _paths["test"] = os.path.join(paths, "billsum_us.test.label.jsonl") paths = _paths 
else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -178,11 +183,15 @@ class MultiNewsLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'multinews.train.label.jsonl')): - raise FileNotFoundError(f"multinews.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'multinews.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'multinews.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'multinews.test.label.jsonl') + if not os.path.isfile( + os.path.join(paths, "multinews.train.label.jsonl") + ): + raise FileNotFoundError( + f"multinews.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, "multinews.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "multinews.valid.label.jsonl") + _paths["test"] = os.path.join(paths, "multinews.test.label.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -212,11 +221,13 @@ class PubmedLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'pubmed.train.label.jsonl')): - raise FileNotFoundError(f"pubmed.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'pubmed.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'pubmed.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'pubmed.test.label.jsonl') + if not os.path.isfile(os.path.join(paths, "pubmed.train.label.jsonl")): + raise FileNotFoundError( + f"pubmed.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, "pubmed.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "pubmed.valid.label.jsonl") + _paths["test"] = os.path.join(paths, "pubmed.test.label.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -246,11 +257,13 @@ class SAMSumLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): 
- if not os.path.isfile(os.path.join(paths, 'SAMSum.train.label.jsonl')): - raise FileNotFoundError(f"SAMSum.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'SAMSum.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'SAMSum.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'SAMSum.test.label.jsonl') + if not os.path.isfile(os.path.join(paths, "SAMSum.train.label.jsonl")): + raise FileNotFoundError( + f"SAMSum.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, "SAMSum.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "SAMSum.valid.label.jsonl") + _paths["test"] = os.path.join(paths, "SAMSum.test.label.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -280,11 +293,13 @@ class WikiHowLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'wikihow.train.label.jsonl')): - raise FileNotFoundError(f"wikihow.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 'wikihow.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'wikihow.val.label.jsonl') - _paths['test'] = os.path.join(paths, 'wikihow.test.label.jsonl') + if not os.path.isfile(os.path.join(paths, "wikihow.train.label.jsonl")): + raise FileNotFoundError( + f"wikihow.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, "wikihow.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "wikihow.val.label.jsonl") + _paths["test"] = os.path.join(paths, "wikihow.test.label.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -314,11 +329,13 @@ class XsumLoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'xsum.train.label.jsonl')): - raise FileNotFoundError(f"xsum.train.label.jsonl is not found in {paths}") - _paths['train'] = os.path.join(paths, 
'xsum.train.label.jsonl') - _paths['dev'] = os.path.join(paths, 'xsum.valid.label.jsonl') - _paths['test'] = os.path.join(paths, 'xsum.test.label.jsonl') + if not os.path.isfile(os.path.join(paths, "xsum.train.label.jsonl")): + raise FileNotFoundError( + f"xsum.train.label.jsonl is not found in {paths}" + ) + _paths["train"] = os.path.join(paths, "xsum.train.label.jsonl") + _paths["dev"] = os.path.join(paths, "xsum.valid.label.jsonl") + _paths["test"] = os.path.join(paths, "xsum.test.label.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -352,19 +369,33 @@ class RedditTIFULoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, f"tifu_{self.tag}.all.label.jsonl")): - raise FileNotFoundError(f"tifu_{self.tag}.all.label.jsonl is not found in {paths}") - - _split_set(f"tifu_{self.tag}.all.label", paths, split_name1="middev", split_name2="train", - ratio=self.valid_ratio + self.test_ratio) + if not os.path.isfile( + os.path.join(paths, f"tifu_{self.tag}.all.label.jsonl") + ): + raise FileNotFoundError( + f"tifu_{self.tag}.all.label.jsonl is not found in {paths}" + ) + + _split_set( + f"tifu_{self.tag}.all.label", + paths, + split_name1="middev", + split_name2="train", + ratio=self.valid_ratio + self.test_ratio, + ) if self.valid_ratio + self.test_ratio > 0: - _split_set('middev', paths, split_name1="test", split_name2="dev", - ratio=self.test_ratio / (self.valid_ratio + self.test_ratio)) - _paths['train'] = os.path.join(paths, 'train.jsonl') + _split_set( + "middev", + paths, + split_name1="test", + split_name2="dev", + ratio=self.test_ratio / (self.valid_ratio + self.test_ratio), + ) + _paths["train"] = os.path.join(paths, "train.jsonl") if self.valid_ratio > 0: - _paths['dev'] = os.path.join(paths, 'dev.jsonl') + _paths["dev"] = os.path.join(paths, "dev.jsonl") if self.test_ratio > 0: - _paths['test'] = os.path.join(paths, 'test.jsonl') + _paths["test"] = 
os.path.join(paths, "test.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -387,8 +418,8 @@ class AMILoader(SumLoader): def __init__(self, valid_ratio=0.05, test_ratio=0.05): # AMI 没有 label fields = { - 'text': 'text', - 'summary': 'summary', + "text": "text", + "summary": "summary", } super(AMILoader, self).__init__(fields) @@ -402,19 +433,29 @@ class AMILoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'AMI.jsonl')): + if not os.path.isfile(os.path.join(paths, "AMI.jsonl")): raise FileNotFoundError(f"AMI.jsonl is not found in {paths}") - _split_set('AMI', paths, split_name1="middev", split_name2="train", - ratio=self.valid_ratio + self.test_ratio) + _split_set( + "AMI", + paths, + split_name1="middev", + split_name2="train", + ratio=self.valid_ratio + self.test_ratio, + ) if self.valid_ratio + self.test_ratio > 0: - _split_set('middev', paths, split_name1="test", split_name2="dev", - ratio=self.test_ratio / (self.valid_ratio + self.test_ratio)) - _paths['train'] = os.path.join(paths, 'train.jsonl') + _split_set( + "middev", + paths, + split_name1="test", + split_name2="dev", + ratio=self.test_ratio / (self.valid_ratio + self.test_ratio), + ) + _paths["train"] = os.path.join(paths, "train.jsonl") if self.valid_ratio > 0: - _paths['dev'] = os.path.join(paths, 'dev.jsonl') + _paths["dev"] = os.path.join(paths, "dev.jsonl") if self.test_ratio > 0: - _paths['test'] = os.path.join(paths, 'test.jsonl') + _paths["test"] = os.path.join(paths, "test.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -437,8 +478,8 @@ class ICSILoader(SumLoader): def __init__(self, valid_ratio=0.05, test_ratio=0.05): # ICSI 没有 label fields = { - 'text': 'text', - 'summary': 'summary', + "text": "text", + "summary": "summary", } super(ICSILoader, self).__init__(fields) self.valid_ratio = valid_ratio @@ -451,19 +492,29 @@ class 
ICSILoader(SumLoader): _paths = {} if paths: if os.path.isdir(paths): - if not os.path.isfile(os.path.join(paths, 'ICSI.jsonl')): + if not os.path.isfile(os.path.join(paths, "ICSI.jsonl")): raise FileNotFoundError(f"ICSI.jsonl is not found in {paths}") - _split_set('ICSI', paths, split_name1="middev", split_name2="train", - ratio=self.valid_ratio + self.test_ratio) + _split_set( + "ICSI", + paths, + split_name1="middev", + split_name2="train", + ratio=self.valid_ratio + self.test_ratio, + ) if self.valid_ratio + self.test_ratio > 0: - _split_set('middev', paths, split_name1="test", split_name2="dev", - ratio=self.test_ratio / (self.valid_ratio + self.test_ratio)) - _paths['train'] = os.path.join(paths, 'train.jsonl') + _split_set( + "middev", + paths, + split_name1="test", + split_name2="dev", + ratio=self.test_ratio / (self.valid_ratio + self.test_ratio), + ) + _paths["train"] = os.path.join(paths, "train.jsonl") if self.valid_ratio > 0: - _paths['dev'] = os.path.join(paths, 'dev.jsonl') + _paths["dev"] = os.path.join(paths, "dev.jsonl") if self.test_ratio > 0: - _paths['test'] = os.path.join(paths, 'test.jsonl') + _paths["test"] = os.path.join(paths, "test.jsonl") paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") @@ -473,19 +524,39 @@ class ICSILoader(SumLoader): return data_bundle -def _split_set(dataset_name, data_dir, split_name1="dev", split_name2="train", ratio=0.0, suffix='jsonl', keep_orig: bool = True): +def _split_set( + dataset_name, + data_dir, + split_name1="dev", + split_name2="train", + ratio=0.0, + suffix="jsonl", + keep_orig: bool = True, +): if ratio == 0: - os.renames(os.path.join(data_dir, f'{dataset_name}.{suffix}'), - os.path.join(data_dir, f'{split_name2}.{suffix}')) + os.renames( + os.path.join(data_dir, f"{dataset_name}.{suffix}"), + os.path.join(data_dir, f"{split_name2}.{suffix}"), + ) return data_dir - if not os.path.exists(os.path.join(data_dir, f'{split_name1}.{suffix}')): + if not 
os.path.exists(os.path.join(data_dir, f"{split_name1}.{suffix}")): if ratio > 0: assert 0 < ratio < 1, "dev_ratio should be in range (0,1)." try: - with open(os.path.join(data_dir, f'{dataset_name}.{suffix}'), 'r', encoding='utf-8') as f, \ - open(os.path.join(data_dir, f'middle_file.{suffix}'), 'w', encoding='utf-8') as f1, \ - open(os.path.join(data_dir, f'{split_name1}.{suffix}'), 'w', encoding='utf-8') as f2: + with open( + os.path.join(data_dir, f"{dataset_name}.{suffix}"), + "r", + encoding="utf-8", + ) as f, open( + os.path.join(data_dir, f"middle_file.{suffix}"), + "w", + encoding="utf-8", + ) as f1, open( + os.path.join(data_dir, f"{split_name1}.{suffix}"), + "w", + encoding="utf-8", + ) as f2: for line in f: if random.random() < ratio: f2.write(line) @@ -494,11 +565,13 @@ def _split_set(dataset_name, data_dir, split_name1="dev", split_name2="train", r if keep_orig: assert split_name1 != dataset_name and split_name2 != dataset_name else: - os.remove(os.path.join(data_dir, f'{dataset_name}.{suffix}')) - os.renames(os.path.join(data_dir, f'middle_file.{suffix}'), - os.path.join(data_dir, f'{split_name2}.{suffix}')) + os.remove(os.path.join(data_dir, f"{dataset_name}.{suffix}")) + os.renames( + os.path.join(data_dir, f"middle_file.{suffix}"), + os.path.join(data_dir, f"{split_name2}.{suffix}"), + ) finally: - if os.path.exists(os.path.join(data_dir, f'middle_file.{suffix}')): - os.remove(os.path.join(data_dir, f'middle_file.{suffix}')) + if os.path.exists(os.path.join(data_dir, f"middle_file.{suffix}")): + os.remove(os.path.join(data_dir, f"middle_file.{suffix}")) - return data_dir + return data_dir \ No newline at end of file