diff --git a/Baseline/model/__init__.py b/fastSum/Baseline/__init__.py similarity index 100% rename from Baseline/model/__init__.py rename to fastSum/Baseline/__init__.py diff --git a/Baseline/config/deeplstm.config b/fastSum/Baseline/config/deeplstm.config similarity index 100% rename from Baseline/config/deeplstm.config rename to fastSum/Baseline/config/deeplstm.config diff --git a/Baseline/config/seqlab.config b/fastSum/Baseline/config/seqlab.config similarity index 100% rename from Baseline/config/seqlab.config rename to fastSum/Baseline/config/seqlab.config diff --git a/Baseline/config/transformer.config b/fastSum/Baseline/config/transformer.config similarity index 100% rename from Baseline/config/transformer.config rename to fastSum/Baseline/config/transformer.config diff --git a/Baseline/model/DeepLSTM.py b/fastSum/Baseline/model/DeepLSTM.py similarity index 100% rename from Baseline/model/DeepLSTM.py rename to fastSum/Baseline/model/DeepLSTM.py diff --git a/Baseline/model/Encoder.py b/fastSum/Baseline/model/Encoder.py similarity index 100% rename from Baseline/model/Encoder.py rename to fastSum/Baseline/model/Encoder.py diff --git a/Baseline/model/LSTMModel.py b/fastSum/Baseline/model/LSTMModel.py similarity index 100% rename from Baseline/model/LSTMModel.py rename to fastSum/Baseline/model/LSTMModel.py diff --git a/Baseline/model/Loss.py b/fastSum/Baseline/model/Loss.py similarity index 100% rename from Baseline/model/Loss.py rename to fastSum/Baseline/model/Loss.py diff --git a/Baseline/model/Metric.py b/fastSum/Baseline/model/Metric.py similarity index 100% rename from Baseline/model/Metric.py rename to fastSum/Baseline/model/Metric.py diff --git a/Baseline/model/TForiginal.py b/fastSum/Baseline/model/TForiginal.py similarity index 100% rename from Baseline/model/TForiginal.py rename to fastSum/Baseline/model/TForiginal.py diff --git a/Baseline/model/TransformerModel.py b/fastSum/Baseline/model/TransformerModel.py similarity index 100% rename from 
Baseline/model/TransformerModel.py rename to fastSum/Baseline/model/TransformerModel.py diff --git a/Baseline/test/__init__.py b/fastSum/Baseline/model/__init__.py similarity index 100% rename from Baseline/test/__init__.py rename to fastSum/Baseline/model/__init__.py diff --git a/PointerGen/model/mq b/fastSum/Baseline/test/__init__.py similarity index 100% rename from PointerGen/model/mq rename to fastSum/Baseline/test/__init__.py diff --git a/Baseline/test/test_dataloader.py b/fastSum/Baseline/test/test_dataloader.py similarity index 100% rename from Baseline/test/test_dataloader.py rename to fastSum/Baseline/test/test_dataloader.py diff --git a/Baseline/test/testdata/test.jsonl b/fastSum/Baseline/test/testdata/test.jsonl similarity index 100% rename from Baseline/test/testdata/test.jsonl rename to fastSum/Baseline/test/testdata/test.jsonl diff --git a/Baseline/test/testdata/train.jsonl b/fastSum/Baseline/test/testdata/train.jsonl similarity index 100% rename from Baseline/test/testdata/train.jsonl rename to fastSum/Baseline/test/testdata/train.jsonl diff --git a/Baseline/test/testdata/val.jsonl b/fastSum/Baseline/test/testdata/val.jsonl similarity index 100% rename from Baseline/test/testdata/val.jsonl rename to fastSum/Baseline/test/testdata/val.jsonl diff --git a/Baseline/test/testdata/vocab b/fastSum/Baseline/test/testdata/vocab similarity index 100% rename from Baseline/test/testdata/vocab rename to fastSum/Baseline/test/testdata/vocab diff --git a/Baseline/test_data.py b/fastSum/Baseline/test_data.py similarity index 100% rename from Baseline/test_data.py rename to fastSum/Baseline/test_data.py diff --git a/Baseline/tools/Callback.py b/fastSum/Baseline/tools/Callback.py similarity index 100% rename from Baseline/tools/Callback.py rename to fastSum/Baseline/tools/Callback.py diff --git a/Baseline/tools/PositionEmbedding.py b/fastSum/Baseline/tools/PositionEmbedding.py similarity index 100% rename from Baseline/tools/PositionEmbedding.py rename to 
fastSum/Baseline/tools/PositionEmbedding.py diff --git a/Baseline/tools/__init__.py b/fastSum/Baseline/tools/__init__.py similarity index 100% rename from Baseline/tools/__init__.py rename to fastSum/Baseline/tools/__init__.py diff --git a/Baseline/tools/data.py b/fastSum/Baseline/tools/data.py similarity index 100% rename from Baseline/tools/data.py rename to fastSum/Baseline/tools/data.py diff --git a/Baseline/tools/logger.py b/fastSum/Baseline/tools/logger.py similarity index 100% rename from Baseline/tools/logger.py rename to fastSum/Baseline/tools/logger.py diff --git a/Baseline/tools/utils.py b/fastSum/Baseline/tools/utils.py similarity index 100% rename from Baseline/tools/utils.py rename to fastSum/Baseline/tools/utils.py diff --git a/Baseline/train.py b/fastSum/Baseline/train.py similarity index 100% rename from Baseline/train.py rename to fastSum/Baseline/train.py diff --git a/Baseline/train_origin.py b/fastSum/Baseline/train_origin.py similarity index 100% rename from Baseline/train_origin.py rename to fastSum/Baseline/train_origin.py diff --git a/Baseline/train_transformer.py b/fastSum/Baseline/train_transformer.py similarity index 100% rename from Baseline/train_transformer.py rename to fastSum/Baseline/train_transformer.py diff --git a/Baseline/transformer/Beam.py b/fastSum/Baseline/transformer/Beam.py similarity index 100% rename from Baseline/transformer/Beam.py rename to fastSum/Baseline/transformer/Beam.py diff --git a/Baseline/transformer/Constants.py b/fastSum/Baseline/transformer/Constants.py similarity index 100% rename from Baseline/transformer/Constants.py rename to fastSum/Baseline/transformer/Constants.py diff --git a/Baseline/transformer/Layers.py b/fastSum/Baseline/transformer/Layers.py similarity index 100% rename from Baseline/transformer/Layers.py rename to fastSum/Baseline/transformer/Layers.py diff --git a/Baseline/transformer/Models.py b/fastSum/Baseline/transformer/Models.py similarity index 100% rename from 
Baseline/transformer/Models.py rename to fastSum/Baseline/transformer/Models.py diff --git a/Baseline/transformer/Modules.py b/fastSum/Baseline/transformer/Modules.py similarity index 100% rename from Baseline/transformer/Modules.py rename to fastSum/Baseline/transformer/Modules.py diff --git a/Baseline/transformer/Optim.py b/fastSum/Baseline/transformer/Optim.py similarity index 100% rename from Baseline/transformer/Optim.py rename to fastSum/Baseline/transformer/Optim.py diff --git a/Baseline/transformer/SubLayers.py b/fastSum/Baseline/transformer/SubLayers.py similarity index 100% rename from Baseline/transformer/SubLayers.py rename to fastSum/Baseline/transformer/SubLayers.py diff --git a/Baseline/transformer/Translator.py b/fastSum/Baseline/transformer/Translator.py similarity index 100% rename from Baseline/transformer/Translator.py rename to fastSum/Baseline/transformer/Translator.py diff --git a/Baseline/transformer/__init__.py b/fastSum/Baseline/transformer/__init__.py similarity index 100% rename from Baseline/transformer/__init__.py rename to fastSum/Baseline/transformer/__init__.py diff --git a/BertSum/callback.py b/fastSum/BertSum/callback.py similarity index 100% rename from BertSum/callback.py rename to fastSum/BertSum/callback.py diff --git a/BertSum/dataloader.py b/fastSum/BertSum/dataloader.py similarity index 100% rename from BertSum/dataloader.py rename to fastSum/BertSum/dataloader.py diff --git a/BertSum/metrics.py b/fastSum/BertSum/metrics.py similarity index 100% rename from BertSum/metrics.py rename to fastSum/BertSum/metrics.py diff --git a/BertSum/model.py b/fastSum/BertSum/model.py similarity index 100% rename from BertSum/model.py rename to fastSum/BertSum/model.py diff --git a/BertSum/train_BertSum.py b/fastSum/BertSum/train_BertSum.py similarity index 100% rename from BertSum/train_BertSum.py rename to fastSum/BertSum/train_BertSum.py diff --git a/BertSum/utils.py b/fastSum/BertSum/utils.py similarity index 100% rename from 
import os
import random
from typing import Union, Dict

from fastNLP.io.loader import JsonLoader
from fastNLP.io.data_bundle import DataBundle
from fastNLP.core.const import Const
from fastNLP.io.file_utils import get_cache_path, _get_dataset_url, cached_path

# Archive name registered on the fastNLP dataset server for each dataset key
# accepted by SumLoader.download().
DATASET_DIR = {
    # Summarization
    'ami': "AMI.zip",
    "arxiv": "Arxiv.zip",
    "billsum": "BillSum.zip",
    "cnndm": "CNNDM.zip",
    "icsi": "ICSI.zip",
    "multi-news": "Multi-News.zip",
    "pubmed": "Pubmed.zip",
    "reddit tifu": "Reddit TIFU.zip",
    "samsum": "SAMSum.zip",
    "wikihow": "WikiHow.zip",
    "xsum": "Xsum.zip"
}


class SumLoader(JsonLoader):
    """
    Parent class of every summarization dataset loader.

    Each JSON line is expected to provide 'text', 'summary' and 'label'
    fields; 'label' is mapped onto fastNLP's ``Const.TARGET`` field.
    """

    def __init__(self):
        fields = {
            'text': 'text',
            'summary': 'summary',
            'label': Const.TARGET
        }
        super(SumLoader, self).__init__(fields=fields)

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        # Abstract hook: every concrete dataset loader overrides this.
        pass

    def download(self, dataset_name):
        """Download (and cache) the archive for ``dataset_name``; return the local directory."""
        default_cache_path = get_cache_path()
        url = _get_dataset_url(dataset_name, DATASET_DIR)
        output_dir = cached_path(url_or_filename=url, cache_dir=default_cache_path, name='dataset')
        return output_dir

    def _load_fixed_splits(self, paths, dataset_name, train_file, dev_file, test_file):
        """Shared ``load`` implementation for datasets released with fixed
        train/dev/test jsonl files.

        :param paths: directory containing the three files, or None to
            download ``dataset_name`` first.
        :param dataset_name: key into ``DATASET_DIR`` used by ``download``.
        :param train_file: train-split file name inside ``paths``.
        :param dev_file: dev-split file name inside ``paths``.
        :param test_file: test-split file name inside ``paths``.
        :return: DataBundle with 'train'/'dev'/'test' datasets.
        :raises FileNotFoundError: if the train file is missing.
        :raises NotADirectoryError: if ``paths`` is not a directory.
        """
        if paths is None:
            paths = self.download(dataset_name)

        _paths = {}
        if paths:
            if os.path.isdir(paths):
                # Only the train file is checked up front; a missing dev/test
                # file surfaces later when it is actually read.
                if not os.path.isfile(os.path.join(paths, train_file)):
                    raise FileNotFoundError(f"{train_file} is not found in {paths}")
                _paths['train'] = os.path.join(paths, train_file)
                _paths['dev'] = os.path.join(paths, dev_file)
                _paths['test'] = os.path.join(paths, test_file)
                paths = _paths
            else:
                raise NotADirectoryError(f"{paths} is not a valid directory.")

        datasets = {name: self._load(path) for name, path in paths.items()}
        return DataBundle(datasets=datasets)

    def _load_ratio_splits(self, paths, dataset_name, file_stem, valid_ratio, test_ratio):
        """Shared ``load`` implementation for datasets released as a single
        ``{file_stem}.jsonl`` that must be split by ratio on first use.

        :param paths: directory containing ``{file_stem}.jsonl``, or None to
            download ``dataset_name`` first.
        :param dataset_name: key into ``DATASET_DIR`` used by ``download``.
        :param file_stem: name (without ``.jsonl``) of the all-in-one file.
        :param valid_ratio: fraction of lines routed to the dev split.
        :param test_ratio: fraction of lines routed to the test split.
        :return: DataBundle with 'train' and, when the ratios are positive,
            'dev'/'test' datasets.
        :raises FileNotFoundError: if ``{file_stem}.jsonl`` is missing.
        :raises NotADirectoryError: if ``paths`` is not a directory.
        """
        if paths is None:
            paths = self.download(dataset_name)

        _paths = {}
        if paths:
            if os.path.isdir(paths):
                if not os.path.isfile(os.path.join(paths, f'{file_stem}.jsonl')):
                    raise FileNotFoundError(f"{file_stem}.jsonl is not found in {paths}")

                # First carve (valid+test) off into 'middev' leaving 'train',
                # then split 'middev' into 'test' and 'dev'.
                _split_set(file_stem, paths, split_name1="middev", split_name2="train",
                           ratio=valid_ratio + test_ratio)
                if valid_ratio + test_ratio > 0:
                    _split_set('middev', paths, split_name1="test", split_name2="dev",
                               ratio=test_ratio / (valid_ratio + test_ratio))
                _paths['train'] = os.path.join(paths, 'train.jsonl')
                if valid_ratio > 0:
                    _paths['dev'] = os.path.join(paths, 'dev.jsonl')
                if test_ratio > 0:
                    _paths['test'] = os.path.join(paths, 'test.jsonl')
                paths = _paths
            else:
                raise NotADirectoryError(f"{paths} is not a valid directory.")

        datasets = {name: self._load(path) for name, path in paths.items()}
        return DataBundle(datasets=datasets)


class CNNDMLoader(SumLoader):
    '''
    Loader for the CNNDM dataset.
    Please cite the following paper if you use this dataset:

    https://www.aclweb.org/anthology/K16-1028/
    '''

    def __init__(self):
        super(CNNDMLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        # Bug fix: this loader previously downloaded the "ami" archive
        # instead of the CNNDM one when paths was None.
        return self._load_fixed_splits(paths, "cnndm",
                                       'CNNDM.train.label.jsonl',
                                       'CNNDM.valid.label.jsonl',
                                       'CNNDM.test.label.jsonl')


class ArxivLoader(SumLoader):
    '''
    Loader for the Arxiv dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1804.05685
    '''

    def __init__(self):
        super(ArxivLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_fixed_splits(paths, "arxiv",
                                       'arxiv.train.label.jsonl',
                                       'arxiv.valid.label.jsonl',
                                       'arxiv.test.label.jsonl')


class BillSumLoader(SumLoader):
    '''
    Loader for the BillSum dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1910.00523
    '''

    def __init__(self):
        super(BillSumLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        # NOTE(review): the dev split intentionally comes from the CA bills
        # file while train/test use US bills, as in the released data —
        # confirm this is the intended pairing.
        return self._load_fixed_splits(paths, "billsum",
                                       'billsum_us.train.label.jsonl',
                                       'billsum_ca.valid.label.jsonl',
                                       'billsum_us.test.label.jsonl')


class MultiNewsLoader(SumLoader):
    '''
    Loader for the MultiNews dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1906.01749
    '''

    def __init__(self):
        super(MultiNewsLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_fixed_splits(paths, "multi-news",
                                       'multinews.train.label.jsonl',
                                       'multinews.valid.label.jsonl',
                                       'multinews.test.label.jsonl')


class PubmedLoader(SumLoader):
    '''
    Loader for the Pubmed dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1804.05685
    '''

    def __init__(self):
        super(PubmedLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_fixed_splits(paths, "pubmed",
                                       'pubmed.train.label.jsonl',
                                       'pubmed.valid.label.jsonl',
                                       'pubmed.test.label.jsonl')


class SAMSumLoader(SumLoader):
    '''
    Loader for the SAMSum dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1911.12237
    '''

    def __init__(self):
        super(SAMSumLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_fixed_splits(paths, "samsum",
                                       'SAMSum.train.label.jsonl',
                                       'SAMSum.valid.label.jsonl',
                                       'SAMSum.test.label.jsonl')


class WikiHowLoader(SumLoader):
    '''
    Loader for the WikiHow dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1810.09305
    '''

    def __init__(self):
        super(WikiHowLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        # NOTE: the dev file really is named '*.val.*' (not '*.valid.*').
        return self._load_fixed_splits(paths, "wikihow",
                                       'wikihow.train.label.jsonl',
                                       'wikihow.val.label.jsonl',
                                       'wikihow.test.label.jsonl')


class XsumLoader(SumLoader):
    '''
    Loader for the Xsum dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1808.08745
    '''

    def __init__(self):
        super(XsumLoader, self).__init__()

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_fixed_splits(paths, "xsum",
                                       'xsum.train.label.jsonl',
                                       'xsum.valid.label.jsonl',
                                       'xsum.test.label.jsonl')


class RedditTIFULoader(SumLoader):
    '''
    Loader for the Reddit TIFU dataset.
    Please cite the following paper if you use this dataset:

    https://arxiv.org/abs/1811.00783
    '''

    def __init__(self, tag, valid_ratio=0.05, test_ratio=0.05):
        """
        :param tag: "long" or "short" — which TIFU variant to load.
        :param valid_ratio: fraction of lines routed to the dev split.
        :param test_ratio: fraction of lines routed to the test split.
        """
        super(RedditTIFULoader, self).__init__()
        self.valid_ratio = valid_ratio
        self.test_ratio = test_ratio
        assert tag in ["long", "short"], "tag not valid (neither long nor short)!"
        self.tag = tag

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_ratio_splits(paths, "reddit tifu",
                                       f"tifu_{self.tag}.all.label",
                                       self.valid_ratio, self.test_ratio)


class AMILoader(SumLoader):
    '''
    Loader for the AMI dataset.
    Please cite the following source if you use this dataset:

    http://groups.inf.ed.ac.uk/ami/download/
    '''

    def __init__(self, valid_ratio=0.05, test_ratio=0.05):
        """
        :param valid_ratio: fraction of lines routed to the dev split.
        :param test_ratio: fraction of lines routed to the test split.
        """
        super(AMILoader, self).__init__()
        self.valid_ratio = valid_ratio
        self.test_ratio = test_ratio

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_ratio_splits(paths, "ami", 'AMI',
                                       self.valid_ratio, self.test_ratio)


class ICSILoader(SumLoader):
    '''
    Loader for the ICSI dataset.
    Please cite the following source if you use this dataset:

    http://groups.inf.ed.ac.uk/ami/icsi/
    '''

    def __init__(self, valid_ratio=0.05, test_ratio=0.05):
        """
        :param valid_ratio: fraction of lines routed to the dev split.
        :param test_ratio: fraction of lines routed to the test split.
        """
        super(ICSILoader, self).__init__()
        self.valid_ratio = valid_ratio
        self.test_ratio = test_ratio

    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
        return self._load_ratio_splits(paths, "icsi", 'ICSI',
                                       self.valid_ratio, self.test_ratio)


def _split_set(dataset_name, data_dir, split_name1="dev", split_name2="train", ratio=0.0, suffix='jsonl'):
    """Randomly split ``{dataset_name}.{suffix}`` in ``data_dir`` into two files.

    Each line goes to ``{split_name1}.{suffix}`` with probability ``ratio``
    and to ``{split_name2}.{suffix}`` otherwise; the source file is removed.
    With ``ratio == 0`` the source file is simply renamed to ``split_name2``.
    This is a no-op when ``{split_name1}.{suffix}`` already exists, so repeated
    loads reuse the same split.

    :return: ``data_dir``
    """
    if ratio == 0:
        os.renames(os.path.join(data_dir, f'{dataset_name}.{suffix}'),
                   os.path.join(data_dir, f'{split_name2}.{suffix}'))
        return data_dir

    if not os.path.exists(os.path.join(data_dir, f'{split_name1}.{suffix}')):
        if ratio > 0:
            assert 0 < ratio < 1, "dev_ratio should be in range (0,1)."
        try:
            with open(os.path.join(data_dir, f'{dataset_name}.{suffix}'), 'r', encoding='utf-8') as f, \
                    open(os.path.join(data_dir, f'middle_file.{suffix}'), 'w', encoding='utf-8') as f1, \
                    open(os.path.join(data_dir, f'{split_name1}.{suffix}'), 'w', encoding='utf-8') as f2:
                for line in f:
                    # Per-line Bernoulli draw decides the destination split.
                    if random.random() < ratio:
                        f2.write(line)
                    else:
                        f1.write(line)
            os.remove(os.path.join(data_dir, f'{dataset_name}.{suffix}'))
            os.renames(os.path.join(data_dir, f'middle_file.{suffix}'),
                       os.path.join(data_dir, f'{split_name2}.{suffix}'))
        finally:
            # Drop the temp file only if the rename above did not consume it.
            if os.path.exists(os.path.join(data_dir, f'middle_file.{suffix}')):
                os.remove(os.path.join(data_dir, f'middle_file.{suffix}'))

    return data_dir
b/fastSum/MatchSum/preprocess/test_cnndm.jsonl similarity index 100% rename from MatchSum/preprocess/test_cnndm.jsonl rename to fastSum/MatchSum/preprocess/test_cnndm.jsonl diff --git a/MatchSum/train_matching.py b/fastSum/MatchSum/train_matching.py similarity index 100% rename from MatchSum/train_matching.py rename to fastSum/MatchSum/train_matching.py diff --git a/MatchSum/utils.py b/fastSum/MatchSum/utils.py similarity index 100% rename from MatchSum/utils.py rename to fastSum/MatchSum/utils.py diff --git a/PointerGen/README.md b/fastSum/PointerGen/README.md similarity index 100% rename from PointerGen/README.md rename to fastSum/PointerGen/README.md diff --git a/PointerGen/data_util/__pycache__/config.cpython-37.pyc b/fastSum/PointerGen/data_util/__pycache__/config.cpython-37.pyc similarity index 100% rename from PointerGen/data_util/__pycache__/config.cpython-37.pyc rename to fastSum/PointerGen/data_util/__pycache__/config.cpython-37.pyc diff --git a/PointerGen/data_util/__pycache__/data.cpython-37.pyc b/fastSum/PointerGen/data_util/__pycache__/data.cpython-37.pyc similarity index 100% rename from PointerGen/data_util/__pycache__/data.cpython-37.pyc rename to fastSum/PointerGen/data_util/__pycache__/data.cpython-37.pyc diff --git a/PointerGen/data_util/__pycache__/logging.cpython-37.pyc b/fastSum/PointerGen/data_util/__pycache__/logging.cpython-37.pyc similarity index 100% rename from PointerGen/data_util/__pycache__/logging.cpython-37.pyc rename to fastSum/PointerGen/data_util/__pycache__/logging.cpython-37.pyc diff --git a/PointerGen/data_util/__pycache__/utils.cpython-37.pyc b/fastSum/PointerGen/data_util/__pycache__/utils.cpython-37.pyc similarity index 100% rename from PointerGen/data_util/__pycache__/utils.cpython-37.pyc rename to fastSum/PointerGen/data_util/__pycache__/utils.cpython-37.pyc diff --git a/PointerGen/data_util/config.py b/fastSum/PointerGen/data_util/config.py similarity index 100% rename from PointerGen/data_util/config.py rename to 
fastSum/PointerGen/data_util/config.py diff --git a/PointerGen/data_util/data.py b/fastSum/PointerGen/data_util/data.py similarity index 100% rename from PointerGen/data_util/data.py rename to fastSum/PointerGen/data_util/data.py diff --git a/PointerGen/data_util/logging.py b/fastSum/PointerGen/data_util/logging.py similarity index 100% rename from PointerGen/data_util/logging.py rename to fastSum/PointerGen/data_util/logging.py diff --git a/PointerGen/data_util/utils.py b/fastSum/PointerGen/data_util/utils.py similarity index 100% rename from PointerGen/data_util/utils.py rename to fastSum/PointerGen/data_util/utils.py diff --git a/PointerGen/decode.py b/fastSum/PointerGen/decode.py similarity index 100% rename from PointerGen/decode.py rename to fastSum/PointerGen/decode.py diff --git a/PointerGen/model/__pycache__/loss.cpython-37.pyc b/fastSum/PointerGen/model/__pycache__/loss.cpython-37.pyc similarity index 100% rename from PointerGen/model/__pycache__/loss.cpython-37.pyc rename to fastSum/PointerGen/model/__pycache__/loss.cpython-37.pyc diff --git a/PointerGen/model/__pycache__/metric.cpython-37.pyc b/fastSum/PointerGen/model/__pycache__/metric.cpython-37.pyc similarity index 100% rename from PointerGen/model/__pycache__/metric.cpython-37.pyc rename to fastSum/PointerGen/model/__pycache__/metric.cpython-37.pyc diff --git a/PointerGen/model/__pycache__/model.cpython-37.pyc b/fastSum/PointerGen/model/__pycache__/model.cpython-37.pyc similarity index 100% rename from PointerGen/model/__pycache__/model.cpython-37.pyc rename to fastSum/PointerGen/model/__pycache__/model.cpython-37.pyc diff --git a/PointerGen/model/loss.py b/fastSum/PointerGen/model/loss.py similarity index 100% rename from PointerGen/model/loss.py rename to fastSum/PointerGen/model/loss.py diff --git a/PointerGen/model/metric.py b/fastSum/PointerGen/model/metric.py similarity index 100% rename from PointerGen/model/metric.py rename to fastSum/PointerGen/model/metric.py diff --git 
a/PointerGen/model/model.bak.py b/fastSum/PointerGen/model/model.bak.py similarity index 100% rename from PointerGen/model/model.bak.py rename to fastSum/PointerGen/model/model.bak.py diff --git a/PointerGen/model/model.py b/fastSum/PointerGen/model/model.py similarity index 100% rename from PointerGen/model/model.py rename to fastSum/PointerGen/model/model.py diff --git a/fastSum/PointerGen/model/mq b/fastSum/PointerGen/model/mq new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PointerGen/train.py b/fastSum/PointerGen/train.py similarity index 100% rename from PointerGen/train.py rename to fastSum/PointerGen/train.py diff --git a/PointerGen/training_ptr_gen/__pycache__/callback.cpython-37.pyc b/fastSum/PointerGen/training_ptr_gen/__pycache__/callback.cpython-37.pyc similarity index 100% rename from PointerGen/training_ptr_gen/__pycache__/callback.cpython-37.pyc rename to fastSum/PointerGen/training_ptr_gen/__pycache__/callback.cpython-37.pyc diff --git a/PointerGen/training_ptr_gen/callback.py b/fastSum/PointerGen/training_ptr_gen/callback.py similarity index 100% rename from PointerGen/training_ptr_gen/callback.py rename to fastSum/PointerGen/training_ptr_gen/callback.py diff --git a/PointerGen/training_ptr_gen/main.py b/fastSum/PointerGen/training_ptr_gen/main.py similarity index 100% rename from PointerGen/training_ptr_gen/main.py rename to fastSum/PointerGen/training_ptr_gen/main.py diff --git a/fastSum/__init__.py b/fastSum/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391