diff --git a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README.md b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README.md index 984ff4a1028e3a40a78539669a5b4832ec338256..36a99fc64ceb2011cad1a19b8703efd58103ee3f 100644 --- a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README.md +++ b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README.md @@ -157,7 +157,7 @@ python3 xnlp_fmk.py \ **ALBERT_zh** *** * ALBERT_zh使用albert_zh做为模型的名称, 每个下游任务各自做为模型名称。 -* ALBERT_zh支持afqmc,cmnli,csl,iflytek,tnews和wsc任务 +* ALBERT_zh支持afqmc,cmnli,csl,iflytek,tnews,lcqmc和wsc任务 * 改变模型入参,以支持不同的任务 * 仅ALBERT_zh Tiny测试过 *** @@ -166,7 +166,7 @@ python3 xnlp_fmk.py \ * --output_dir的传参与--data_dir相同, 预处理脚本会将文本转换为该路径下的bin文件 * --vocab_file, --bert_config_file, --do_lower_case, --max_seq_length, --doc_stride等参数进行微调 * --model_name:当进行ALBERT_en任务时,参数为albert_en -* --task_name为下游所需的任务名, 仅支持afqmc, cmnli, csl, iflytek, tnews 和 wsc 任务 +* --task_name为下游所需的任务名, 仅支持afqmc, cmnli, csl, iflytek, tnews, lcqmc 和 wsc 任务 ```Bash python3 xnlp_fmk.py \ --data_dir=./data/TNEWS \ diff --git a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README_EN.md b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README_EN.md index f6a501beb2e6cb9c08046b3b435f67ab778bfe0d..69192912fa0e649e261ebe510f4872e963daf495 100644 --- a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README_EN.md +++ b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/README_EN.md @@ -157,7 +157,7 @@ python3 xnlp_fmk.py \ **ALBERT_zh** *** * ALBERT_zh use albert_zh for model_name parameter, each downstream task name for task_name -* ALBERT_zh support afqmc, cmnli, csl, iflytek, tnews and wsc tasks +* ALBERT_zh support afqmc, cmnli, csl, iflytek, tnews, lcqmc and wsc tasks * Change the parameters for different tasks * Only ALBERT_zh Tiny has been tested *** @@ -166,7 +166,7 @@ python3 xnlp_fmk.py \ * Change --output_dir to the same with --data_dir, and preprocess script will convert text to bin files under this path * Keep the --vocab_file, --bert_config_file, --do_lower_case, --max_seq_length, 
--doc_stride, etc. the same with fine-tuning parameters * Keep the --model_name=albert_zh when do the ALBERT_zh tasks -* Change --task_name to the downstream task you want to do, only support afqmc, cmnli, csl, iflytek, tnews and wsc tasks +* Change --task_name to the downstream task you want to do, only support afqmc, cmnli, csl, iflytek, tnews, lcqmc and wsc tasks ```Bash python3 xnlp_fmk.py \ --data_dir=./data/TNEWS \ diff --git a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/libs/convert_txt2json.py b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/libs/convert_txt2json.py new file mode 100644 index 0000000000000000000000000000000000000000..32d4e0170115c4a9e8c227187a861b6718a8c614 --- /dev/null +++ b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/libs/convert_txt2json.py @@ -0,0 +1,19 @@ +#针对lcqmc数据集,做txt到json文件转换 +import json + +def convert_txt2Json(txt_path, json_path): + ''' + txt_path为lcqmc数据集dev.txt路径 + json_path为输出json文件的路径 + ''' + dev_txt = open(txt_path) + lines = dev_txt.read().split('\n') + lines.pop() + with open(json_path, 'w') as f: + for line in lines: + json_line = line.split('\t') + dict_line = {"sentence1": json_line[0], "sentence2": json_line[1], "label": json_line[2]} + arr = json.dumps(dict_line) + f.write(arr + '\n') + dev_txt.close() + diff --git a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/network/run_albert_lcqmc.py b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/network/run_albert_lcqmc.py new file mode 100644 index 0000000000000000000000000000000000000000..82ba2ea108d33c481b3a7e6e0f5af0cc9c664230 --- /dev/null +++ b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/network/run_albert_lcqmc.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# @Author: fengtingyan +# @Date: 2023-3-29 21:11:00 +# @Last Modified by: fengtingyan +# @Last Modified time: 2023-3-29 21:11:00 + +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import csv +import json +import os + +import tensorflow.compat.v1 as tf +from absl import flags + +from network import tokenization +from network.classifier_utils import DataProcessor, InputExample + +FLAGS = flags.FLAGS + +__all__ = ['LcqmcProcessor'] + + +class LcqmcProcessor(DataProcessor): +    """Processor for the LCQMC data set.""" + +    def get_examples(self): +        """See base class.""" +        return self.create_examples(self.read_json(os.path.join(FLAGS.data_dir, "dev.json")), "dev") + +    def get_labels(self): +        """See base class.""" +        return ["0", "1"] + +    @staticmethod +    def create_examples(lines, set_type): +        """Creates examples for the training and dev sets.""" +        #todo +        examples = [] +        for (i, line) in enumerate(lines): +            guid = "%s-%s" % (set_type, i) +            text_a = tokenization.convert_to_unicode(line['sentence1']) +            text_b = tokenization.convert_to_unicode(line['sentence2']) +            label = tokenization.convert_to_unicode(line['label']) if set_type != 'test' else '0' +            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) +        return examples + +    @staticmethod +    def read_label(input_file): +        with tf.io.gfile.GFile(input_file, "r") as f: +            reader = csv.reader(f, delimiter="\t", quotechar=None) +            labels = [] +            for (i, line) in enumerate(reader): +                data_json = json.loads(line[0]) +                labels.append(int(data_json['label'])) +            return labels diff --git a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/xnlp_fmk.py b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/xnlp_fmk.py index b5c26ce44e290dc059c09b2826b5eb786af366dc..723312d01aee5f4f684cff3e6b3f6d40a5e02f9f 100644 --- a/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/xnlp_fmk.py +++ b/ACL_TensorFlow/contrib/nlp/ALBERT_for_ACL/xnlp_fmk.py @@ -30,6 +30,7 @@ from libs.predict_pb import pb_predict from libs.preprocess import preprocess from network.run_albert_afqmc import 
AfqmcProcessor as AlbertAfqmc from network.run_albert_cmnli import CmnliProcessor as AlbertCmnli +from network.run_albert_lcqmc import LcqmcProcessor as AlbertLcqmc from network.run_albert_csl import CslProcessor as AlbertCsl from network.run_albert_iflytek import IflytekProcessor as AlbertIflytek from network.run_albert_race import RaceProcessor as AlbertRace @@ -66,6 +67,7 @@ MODEL = { "iflytek": AlbertIflytek, "tnews": BertTnews, "wsc": AlbertWsc, + "lcqmc": AlbertLcqmc, }, "bert": { # BERT by google