From 64c776ca9068de4b13c55a5c75221ca943e05975 Mon Sep 17 00:00:00 2001 From: "L.L" <12826711+lct_ll@user.noreply.gitee.com> Date: Sun, 20 Oct 2024 16:17:32 +0800 Subject: [PATCH 1/6] add prott5 --- MindSPONGE/applications/model_cards/ProtT5.md | 149 ++++++++++++ .../ProtT5/t5_downstream_task_eval.yaml | 13 ++ .../ProtT5/t5_downstream_task_train.yaml | 13 ++ .../model_configs/ProtT5/t5_predict.yaml | 16 ++ .../model_configs/ProtT5/t5_pretrain.yaml | 16 ++ .../model_configs/ProtT5/t5_xl.yaml | 43 ++++ .../mindsponge/pipeline/models/__init__.py | 2 + .../pipeline/models/protT5/__init__.py | 21 ++ .../models/protT5/downstream/__init__.py | 15 ++ .../models/protT5/downstream/deeploc_task.py | 94 ++++++++ .../downstream/downstream_configuration.py | 23 ++ .../protT5/downstream/downstream_nets.py | 107 +++++++++ .../protT5/downstream/downstream_task.py | 106 +++++++++ .../models/protT5/downstream/hhblits_task.py | 114 ++++++++++ .../downstream/protT5_downstream_tasks.py | 76 +++++++ .../models/protT5/downstream/task_datasets.py | 198 ++++++++++++++++ .../models/protT5/pretrain/__init__.py | 15 ++ .../models/protT5/pretrain/optimization.py | 55 +++++ .../protT5/pretrain/pretrain_configuration.py | 23 ++ .../protT5/pretrain/pretrain_dataloader.py | 116 ++++++++++ .../models/protT5/pretrain/pretrain_protT5.py | 214 ++++++++++++++++++ .../models/protT5/pretrain/t5_modeling.py | 97 ++++++++ .../models/protT5/scripts/convert_weight.py | 180 +++++++++++++++ .../protT5/scripts/trans_csv_to_mindrecord.py | 114 ++++++++++ .../pipeline/models/protT5/utils/utils.py | 34 +++ .../src/mindsponge/pipeline/pipeline.py | 4 + 26 files changed, 1858 insertions(+) create mode 100644 MindSPONGE/applications/model_cards/ProtT5.md create mode 100644 MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml create mode 100644 MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml create mode 100644 MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml create mode 100644 MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml create mode 100644 MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/__init__.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/__init__.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/deeploc_task.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_configuration.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_nets.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_task.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/hhblits_task.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/protT5_downstream_tasks.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/task_datasets.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/__init__.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/optimization.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_configuration.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_dataloader.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_protT5.py create mode 100644 
MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/t5_modeling.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/convert_weight.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/trans_csv_to_mindrecord.py create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/protT5/utils/utils.py
diff --git a/MindSPONGE/applications/model_cards/ProtT5.md b/MindSPONGE/applications/model_cards/ProtT5.md
new file mode 100644
index 000000000..2061e55b6
--- /dev/null
+++ b/MindSPONGE/applications/model_cards/ProtT5.md
@@ -0,0 +1,149 @@
+# ProtT5
+
+## Model introduction
+Computational biology and bioinformatics have accumulated vast amounts of protein sequence data, which lend themselves naturally to the language models used in natural language processing. These language models reach new levels of predictive performance at low inference cost. ProtTrans provides state-of-the-art pretrained models for protein research, and ProtT5 is the best-performing of the pretrained models in that project.
+For details see the project homepage and paper: [github](https://github.com/agemagician/ProtTrans)
+
+We provide the ProtT5 checkpoint together with prediction and training interfaces under the MindSpore framework, as well as training and prediction interfaces for the downstream tasks built on the pretrained ProtT5 model.
+The ProtTrans paper covers two kinds of downstream tasks, predicting protein-level properties and per-amino-acid properties, which correspond to sample-level and token-level classification models respectively; see the authors' paper and the project homepage for the experimental details.
+The training and evaluation data for the downstream-task experiments can be downloaded from the links provided on the project homepage.
+
+
+### Obtaining the model weights
+Download the PyTorch checkpoint from the [model page](https://huggingface.co/Rostlab/prot_t5_xl_uniref50), then convert it to a MindSpore-compatible format with the `convert_weight.py` script in the `protT5/scripts` folder:
+```shell
+python scripts/convert_weight.py --layers 24 --torch_path pytorch_model.bin --mindspore_path ./mindspore_t5.ckpt
+```
+After conversion, add a YAML configuration file; see `model_configs/ProtT5/t5_xl.yaml` for reference, or simply copy that file into the corresponding directory. The meaning of the configuration parameters follows the t5_config definition in mindformers. These files live under the `MindSPONGE` project.
+
+- File structure
+```bash
+# The checkpoint directory is organized as follows
+└── prot_t5_xl_uniref50
+    ├── prot_t5_xl_uniref50.ckpt    # weight file; converted from the PyTorch bin file
+    ├── prot_t5.yaml                # network configuration file, added manually
+    ├── special_tokens_map.json     # tokenizer
+    ├── spiece.model                # tokenizer model
+    └── tokenizer_config.json       # tokenizer config
+```
+
+## Usage
+
+### Dependencies
+* mindspore >= 2.3.0
+* mindformers >= 1.2.0
+* sentencepiece >= 0.2.0
+
+### ProtT5 prediction
+```python
+from mindsponge import PipeLine
+
+config_path = 'configs/t5_predict.yaml'  # change to your local path as needed
+pipe = PipeLine(name="ProtT5")
+pipe.set_device_id(0)
+pipe.initialize(config_path=config_path)
+
+# predict
+data = ["A E T C Z A O", "S K T Z P"]
+res = pipe.predict(data, mode="generate")
+print("Generated:", res)
+# Generated: ['A E T C X A X', 'S K T X P']
+
+res = pipe.predict(data, mode="embedding")
+print("Embedding:", res)
+# Embedding:
+[[[ 1.71719193e-01 -1.40796244e-01 -2.04709724e-01 ...  1.45269990e-01
+    1.47509247e-01 -7.32109100e-02]
+  [ 9.36630294e-02 -1.16918117e-01 -2.99756974e-01 ...  1.00125663e-01
+   -2.26259604e-01  2.25636318e-01]
+  [ 1.93479404e-01 -9.52076018e-02 -2.92140573e-01 ...  6.69623986e-02
+    3.05505600e-02  1.31701231e-01]
+  ...
+```
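+
+The embedding mode returns one vector per residue (hidden size 1024 for the XL model). As a minimal, unverified sketch, a single per-protein vector can be obtained by averaging over the residue positions of each sequence; the lengths below simply mirror the two sequences in `data`, and special-token or padding positions may need to be excluded depending on the tokenizer settings:
+
+```python
+import numpy as np
+
+embeddings = np.asarray(res)      # (batch, seq_len, 1024) encoder hidden states
+lengths = [7, 5]                  # residue counts of the two example sequences
+per_protein = np.stack([embeddings[i, :n].mean(axis=0) for i, n in enumerate(lengths)])
+print(per_protein.shape)          # (2, 1024)
+```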
+
+### ProtT5 pretraining
+```bash
+# single device; fill in the yaml file according to the configuration template
+config_path = 'configs/t5_pretrain.yaml'  # change to your local path as needed
+pipe = PipeLine(name="ProtT5")
+pipe.initialize(config_path=config_path)
+pipe.model.init_trainer()
+pipe.model.train()
+
+# multi-device parallel training; run_pretrain.py contains the same code as the single-device case and is launched with msrun
+msrun --worker_num=${worknum} --local_worker_num=${worknum} --master_port=8128 --log_dir=msrun_log --join=True --cluster_time_out=600 ./run_pretrain.py
+```
+
+
+### Downstream tasks
+```python
+import mindspore as ms
+from mindsponge import PipeLine
+
+pipe = PipeLine(name="ProtT5Downstream")
+pipe.set_device_id(0)
+config_path = 'configs/t5_downstream_task_eval.yaml'
+pipe.initialize(config_path=config_path)
+
+# predict
+data = ["S L R F T A S T S T P K S G S K I A K R G K K H P E P V A S W M S E Q R W A G E P E V M C T L Q H K S I A Q E A Y K N Y T I T T S A V C K L V R Q L Q Q Q A L S L Q V H F E R S E R V L S G L Q A S S L P E A L A G A T Q L L S H L D D F T A T L E R R G V F F N D A K I E R R R Y E Q H L E Q I R T V S K D T R Y S L E R Q H Y I N L E S L L D D V Q L L K R H T L I T L R L I F E R L V R V L V I S I E Q S Q C D L L L R A N I N M V A T L M N I D Y D G F R S L S D A F V Q N E A V R T L L V V V L D H K Q S S V R A L A L R A L A T L C C A P Q A I N Q L G S C G G I E I V R D I L Q V E S A G E R G A I E R R E A V S L L A Q I T A A W H G S E H R V P G L R D C A E S L V A G L A A L L Q P E"]
+res = pipe.predict(data)
+print("Output:", res)
+# Output: ['Cytoplasm']
+
+# evaluate the test set; the dataset download link is on the project homepage
+eval_data_path = "./dataset/deeploc_test_set.csv"
+pipe.model.eval_acc(eval_data_path)
+# Accuracy 0.8129
+
+# train
+# set train_data_path, eval_data_path and related parameters in the config file
+# set parallel to True in the yaml file
+ms.set_context(mode=ms.GRAPH_MODE, device_target='Ascend', device_id=0)
+pipe = PipeLine(name="ProtT5Downstream")
+config_path = 'configs/t5_downstream_task_train.yaml'
+pipe.initialize(config_path=config_path)
+pipe.model.train()
+```
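+
+The example above uses the deeploc (protein-level) task. The same pipeline also exposes the token-level hhblits task (3-state and 8-state secondary structure), whose `predict` returns one label per residue. The snippet below is an unverified sketch: the input sequence and the evaluation CSV path are placeholders, and the eval config is assumed to set `task_name: "hhblits"` with a matching downstream checkpoint:
+
+```python
+pipe = PipeLine(name="ProtT5Downstream")
+pipe.set_device_id(0)
+pipe.initialize(config_path='configs/t5_downstream_task_eval.yaml')  # task_name: "hhblits"
+
+data = ["M T E Y K L V V V G A G G V G K S A L T I Q"]  # placeholder sequence
+q3, q8 = pipe.predict(data)  # per-residue 3-state and 8-state labels
+print(q3[0], q8[0])
+
+# evaluation prints Q3/Q8 accuracy over a labelled CSV in the project's hhblits format
+pipe.model.eval_acc("./dataset/hhblits_test_set.csv")  # placeholder path
+```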
+
+### Pretraining notes
+The main contribution of ProtTrans is pretraining language models on protein amino-acid sequences; the notes below concern model training.
+
+- Data conversion
+
+For training efficiency, the raw data must first be converted to the mindrecord format. The `uniref50` data can be used as the raw pretraining data; the conversion script and its usage are shown below. `number_samples` sets how many samples to convert, and the default `-1` converts all data.
+```shell
+# arguments: raw csv data directory; output directory; model checkpoint path; number of samples to convert
+python scripts/trans_csv_to_mindrecord.py --data_dir ../unif50 --output_dir ../unif50_mindrecord --t5_config_path ../prot_t5_xl_uniref50 --number_samples 50000
+```
+
+- T5 parameter configuration
+
+Besides the per-layer parameter sizes and dropout rates, the following parameters also deserve attention:
+```yaml
+# scaling factor for weight initialization; usually <= 1
+initializer_factor: 1.0
+
+# data type of each layer, compatible with mixed precision: float32 or float16
+# float32 is recommended throughout for the T5 model
+param_init_type: "float32"
+layernorm_compute_type: "float32"
+softmax_compute_type: "float32"
+compute_dtype: "float32"
+```
+
+
+## Citation
+
+```bibtex
+@article{9477085,
+  author={Elnaggar, Ahmed and Heinzinger, Michael and Dallago, Christian and Rehawi, Ghalia and Yu, Wang and Jones, Llion and Gibbs, Tom and Feher, Tamas and Angerer, Christoph and Steinegger, Martin and Bhowmik, Debsindhu and Rost, Burkhard},
+  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
+  title={ProtTrans: Towards Cracking the Language of Lifes Code Through Self-Supervised Deep Learning and High Performance Computing},
+  year={2021},
+  volume={},
+  number={},
+  pages={1-1},
+  doi={10.1109/TPAMI.2021.3095381}
+}
+```
\ No newline at end of file
diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml
new file mode 100644
index 000000000..2281405ed
--- /dev/null
+++ b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml
@@ -0,0 +1,13 @@
+mode: eval
+task_name: "hhblits"  # "hhblits" or "deeploc"
+t5_config_path: "prot_t5_xl_uniref50_ms"  # T5 base model; directory containing the yaml, ckpt and tokenizer model
+checkpoint_path: "./model_deeploc_loc.ckpt"  # downstream-task ckpt; not released, train your own
+
+train:
+  lr: 0.001
+  epochs: 4
+  batch_size: 16
+  train_data_path: ""  # empty for prediction
+  eval_data_path: ""
+  checkpoint_save_path: null
+  cate_name: ''
\ No newline at end of file
diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml
new file mode 100644
index 000000000..76c1dad84
--- /dev/null
+++ b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml
@@ -0,0 +1,13 @@
+mode: train
+task_name: "deeploc"  # "hhblits" or "deeploc"
+t5_config_path: "prot_t5_xl_uniref50_ms"  # T5 base model; directory containing the yaml, ckpt and tokenizer model
+checkpoint_path: null
+
+train:
+  lr: 0.001  # learning rate; cosine decay is used by default
+  epochs: 4
+  batch_size: 16
+  train_data_path: "./deeploc_our_train_set.csv"
+  eval_data_path: "./deeploc_test_set.csv"
+  checkpoint_save_path: null
+  cate_name: "membrane"  # the two deeploc subtasks are selected via cate_name: membrane or loc; not needed for the hhblits task
\ No newline at end of file
diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml
new file mode 100644
index 000000000..826c16ac5
--- /dev/null
+++ b/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml
@@ -0,0 +1,16 @@
+mode: "eval"  # eval or train
+parallel: False  # whether to use parallelism
+t5_config_path: "prot_t5_xl_uniref50_ms/"  # T5 base model; directory containing the yaml, ckpt and tokenizer model
+load_model_path: null
+
+# training-related parameters
+train:
+  train_data_path: ''
+  lr: 2.0e-5
+  warmup_steps: 0
+  batch_size: 32
+  epochs: 1
+  save_steps: 20000
+  save_ckpt_path: "output/"
+  use_clip_grad: True
+  max_grad_norm: 1
\ No newline at end of file
diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml
new file mode 100644
index 000000000..bccef82e0
--- /dev/null
+++ b/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml
@@ -0,0 +1,16 @@
+mode: "train"
+parallel: False  # whether to use parallelism
+t5_config_path: "prot_t5_xl_uniref50_ms/t5_xl.yaml"  # T5 yaml configuration file
+load_model_path: null  # optional checkpoint path; used to resume training
+
+# training-related parameters
+train:
+  train_data_path: 'train_data'  # training-data directory; files are in mindrecord format
+  lr: 2.0e-5  # learning rate
+  warmup_steps: 0
+  batch_size: 32
+  epochs: 1
+  save_steps: 20000
+  save_ckpt_path: "output/"
+  use_clip_grad: True
+  max_grad_norm: 1
\ No newline at end of file
diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml
new file mode 100644
index 000000000..79fa20598
--- /dev/null
+++ b/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml
@@ -0,0 +1,43 @@
+model:
+  arch:
+    type: T5ForConditionalGeneration
+  model_config:
+    attention_dropout_rate: 0.1
+    batch_size: 8
+    d_ff: 16384
+    d_kv: 128
+    do_sample: false
+    embedding_dropout_prob: 0.1
+    eos_token_id: 1
+    has_relative_bias: true
+    hidden_act: relu
+    hidden_dropout_rate: 0.1
+    hidden_size: 1024
+    is_encoder_decoder: true
+    layer_norm_epsilon: 1.0e-06
+    length_penalty_weight: 1.0
+    max_position_embeddings: 512
+    num_heads: 32
+    num_layers: 24
+    offset: 0
+    pad_token_id: 0
+    post_layernorm_residual: false
+    relative_attention_num_buckets: 32
+    repetition_penalty: 1
+    scale_output: true
+    seq_length:
512 + max_decode_length: 512 + start_token_id: 0 + top_k: 1 + top_p: 0.95 + type: T5Config + use_cache: False + use_past: False + vocab_size: 128 + + initializer_factor: 0.7 + initializer_range: 0.02 + param_init_type: "float32" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + compute_dtype: "float32" \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/__init__.py index 3e0858a40..d3e15ebc7 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/__init__.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/__init__.py @@ -35,3 +35,5 @@ from .multimer import Multimer, MultimerDataSet, multimer_configuration from .proteinmpnn import ProteinMpnn, ProteinMpnnDataset, proteinmpnn_configuration from .ufold import UFold, UFoldDataSet, ufold_configuration from .rasp import RASP, RASPDataSet, rasp_configuration +from .protT5 import ProtT5, ProtT5TrainDataSet, protT5pretrain_configuration +from .protT5 import ProtT5DownstreamTasks, ProtT5TaskDataSet, protT5downtask_configuration \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/__init__.py new file mode 100644 index 000000000..06f0d432d --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from .pretrain.pretrain_protT5 import ProtT5 +from .pretrain.pretrain_dataloader import ProtT5TrainDataSet +from .pretrain.pretrain_configuration import protT5pretrain_configuration + +from .downstream.protT5_downstream_tasks import ProtT5DownstreamTasks +from .downstream.task_datasets import ProtT5TaskDataSet +from .downstream.downstream_configuration import protT5downtask_configuration \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/__init__.py new file mode 100644 index 000000000..175a0ece7 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Module""" \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/deeploc_task.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/deeploc_task.py new file mode 100644 index 000000000..462c74cc4 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/deeploc_task.py @@ -0,0 +1,94 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Pretrain property prediction task; deeploc dataset.""" +from mindspore import nn +import mindspore.ops as ops + +from .downstream_nets import MeanPoolingClassifier, EmbeddingTaskNet +from .downstream_task import BaseTask, lr_secheduler +from .task_datasets import create_deeploc_dataset, map_label_to_category, \ + LOC_CATES, LOC_LABEL_TO_CATE, MEMBRANE_LABEL_TO_CATE + + +class DeeplocTask(BaseTask): + """ + Pretrain property prediction task of deeploc dataset. The network is a mean pooling classifier of embeddings. + """ + def __init__(self, config): + """ + This method initializes the network, has train and eval interface. + cate_name represents different tasks: "loc", "membrane" + """ + super().__init__(config) + self.cate_name = config.train.cate_name + + if self.cate_name == 'loc': + self.label_to_cate = LOC_LABEL_TO_CATE + num_classes = len(LOC_CATES) + else: + self.label_to_cate = MEMBRANE_LABEL_TO_CATE + num_classes = 2 + + # apply mean pooling in hidden states of prot T5 model encoder + mpc_net = MeanPoolingClassifier(num_classes) + self.net = EmbeddingTaskNet(mpc_net, self.t5_config_path) + + if self.checkpoint_path: + self.net.load_from_pretrained(self.checkpoint_path) + + if self.train_conf.train_data_path: + self.train_dataset = create_deeploc_dataset(self.train_conf.train_data_path, + self.t5_tokenizer, batch_size=self.train_conf.batch_size, cate_name=self.cate_name) + self.loss_fn = nn.CrossEntropyLoss(reduction='mean') + batch_num = self.train_dataset.get_dataset_size() + learning_rate = lr_secheduler(self.train_conf.lr, batch_num, self.train_conf.epochs) + self.optimizer = nn.Adam(self.net.trainable_params(), learning_rate=learning_rate) + + if self.train_conf.eval_data_path: + self.eval_dataset = create_deeploc_dataset(self.train_conf.eval_data_path, + self.t5_tokenizer, batch_size=self.train_conf.batch_size, cate_name=self.cate_name) + + @staticmethod + def __eval_fn(model_fn, dataset): + metric = nn.Accuracy('classification') + metric.clear() + for inputs, masks, targets in dataset: + logits = model_fn(inputs, masks) + metric.update(logits, targets) + + accuracy = metric.eval() + dataset.reset() + return accuracy + + def eval_fn(self, dataset): + return DeeplocTask.__eval_fn(self.net, dataset) + + def forward_fn(self, inputs, masks, targets): + logits = self.net(inputs, masks) + loss = self.loss_fn(logits, targets) + return loss + + def eval_acc(self, 
eval_data_path): + eval_dataset = create_deeploc_dataset(eval_data_path, self.t5_tokenizer, + batch_size=self.train_conf.batch_size, cate_name=self.cate_name) + return self.eval_fn(eval_dataset) + + def predict(self, data): + logits = self.net(*data) + softmax = ops.Softmax(axis=1) + probabilities = softmax(logits) + predicted_labels = ops.Argmax(axis=1)(probabilities).asnumpy() + predicted_cates = map_label_to_category(predicted_labels, self.label_to_cate) + return predicted_cates \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_configuration.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_configuration.py new file mode 100644 index 000000000..712eb66c9 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_configuration.py @@ -0,0 +1,23 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"protT5 downstream task configure" +protT5downtask_configuration = { + "protT5_base": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml", + "protT5downtask_predict": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml", + "protT5downtask_train": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml" +} \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_nets.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_nets.py new file mode 100644 index 000000000..26772ddd7 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_nets.py @@ -0,0 +1,107 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Networks of downstream task.""" +import logging + +import mindspore as ms +import mindspore.nn as nn +import mindspore.ops as ops +from mindformers import T5ForConditionalGeneration + + +logger = logging.getLogger(__name__) +EMBEDIING_LENGTH = 1024 + + +class ConvNet(nn.Cell): + """acid token level predictor; using convolution net to convergence of local information.""" + def __init__(self): + super().__init__() + # CNN weights are trained on ProtT5 embeddings + self.feature_extractor = nn.SequentialCell([ + nn.Conv2d(EMBEDIING_LENGTH, 32, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True), # 7x32 + nn.ReLU(), + nn.Dropout(p=0.1), + ]) + + n_final_in = 32 + self.dssp3_classifier = nn.Conv2d(n_final_in, 3, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + self.dssp8_classifier = nn.Conv2d(n_final_in, 8, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + self.diso_classifier = nn.Conv2d(n_final_in, 2, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + + def construct(self, embeddings, masks): + # IN: X = (B x L x F); OUT: (B x F x L, 1) + x = embeddings * ops.expand_dims(masks, -1) + x = ops.Transpose()(x, (0, 2, 1)).unsqueeze(-1) + x = self.feature_extractor(x) # OUT: (B x 32 x L x 1) + d3_clf = self.dssp3_classifier(x).squeeze(-1).transpose((0, 2, 1)) # OUT: (B x L x 3) + d8_clf = self.dssp8_classifier(x).squeeze(-1).transpose((0, 2, 1)) # OUT: (B x L x 8) + diso_clf = self.diso_classifier(x).squeeze(-1).transpose((0, 2, 1)) # OUT: (B x L x 2) + return d3_clf, d8_clf, diso_clf + + +class MeanPoolingClassifier(nn.Cell): + def __init__(self, num_classes): + super().__init__() + self.num_classes = num_classes + + self.dense = nn.Dense(EMBEDIING_LENGTH, 32, activation='relu') + self.classifier = nn.Dense(32, num_classes) + self.dropout = nn.Dropout(p=0.1) + + def construct(self, embeddings, masks): + masks = ops.cast(masks, ms.float32) + masked_inputs = embeddings * ops.expand_dims(masks, -1) + + mean_pooled = ops.ReduceMean(keep_dims=False)(masked_inputs, 1) + + mean_pooled = self.dropout(mean_pooled) + compressed = self.dense(mean_pooled) + output = self.classifier(compressed) + return output + + +class EmbeddingTaskNet(nn.Cell): + """Base net of Embedding part for downstream task.""" + def __init__(self, downstream_net, t5_config_path): + super(EmbeddingTaskNet, self).__init__() + self.downstream_net = downstream_net + + self.t5 = T5ForConditionalGeneration.from_pretrained(t5_config_path) + self.t5.set_train(False) + + # freeze pretrain model parameters + for param in self.t5.trainable_params(): + param.requires_grad = False + + def construct(self, inputs, masks): + masks = ops.cast(masks, ms.float32) + embeddings = self.t5.encoder_forward(inputs, masks) + output = self.downstream_net(embeddings, masks) + return output + + def load_from_pretrained(self, config_path): + # load downstream task checkpoint + non_pretrained_param_dict = ms.load_checkpoint(config_path) + param_not_load, _ = ms.load_param_into_net(self.downstream_net, non_pretrained_param_dict) + self.downstream_net.set_train(False) + self.set_train(False) + logger.warning("Not Loaded param list: %s", param_not_load) + + def save_checkpoint(self, model_path): + non_pretrained_param_dict = {} + for param in self.downstream_net.trainable_params(): + non_pretrained_param_dict[param.name] = param.data + ms.save_checkpoint(non_pretrained_param_dict, 
model_path) diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_task.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_task.py new file mode 100644 index 000000000..7d6047f2d --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_task.py @@ -0,0 +1,106 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Base trainer of downstream tasks.""" +from abc import abstractmethod + +from mindspore import nn, value_and_grad +from mindformers import T5Tokenizer +from mindformers.tools.logger import get_logger + + +logger = get_logger(logger_name='DownstreamTask') +PRINT_STEPS = 80 + + +def lr_secheduler(init_lr, batch_num, epochs): + """Cosine decay learning rate""" + lr_max = init_lr # max lr + lr_min = 5e-5 # min lr + decay_steps = int(epochs * batch_num) + lr_sch = nn.CosineDecayLR(min_lr=lr_min, max_lr=lr_max, decay_steps=decay_steps) + return lr_sch + + +class BaseTask: + """ + Base class of downstream tasks with train and eval interface. + + Args: + config.cate_name: the dataset has two subtask with different label name: 'loc', 'membrane' + config.t5_config_path: prot t5 pretrain model directory path. + config.checkpoint_path: the task checkpoint path; use to eval. 
+ """ + def __init__(self, config): + self.mode = config.mode + self.task_name = config.task_name + self.t5_config_path = config.t5_config_path + self.checkpoint_path = config.checkpoint_path + + self.train_conf = config.train + self.checkpoint_save_path = config.train.checkpoint_save_path + self.epochs = config.train.epochs + + self.net = None + self.train_dataset = None + self.eval_dataset = None + self.grad_fn = None + self.t5_tokenizer = T5Tokenizer.from_pretrained(config.t5_config_path) + + @abstractmethod + def eval_acc(self, eval_data_path): + pass + + @abstractmethod + def forward_fn(self, *args): + pass + + @abstractmethod + def eval_fn(self, dataset): + pass + + def train_step(self, *args): + loss, grads = self.grad_fn(*args) + self.optimizer(grads) + return loss + + def train(self): + weights = self.net.trainable_params() + self.grad_fn = value_and_grad(self.forward_fn, None, weights) + + logger.info("Begin training...") + for epoch in range(self.epochs): + logger.info("Epoch: %d", epoch) + step = 0 + loss_steps = 0.0 + for inputs in self.train_dataset: + step += 1 + loss = self.train_step(*inputs) + loss_steps += loss.asnumpy() + if step % PRINT_STEPS == 0: + logger.info("loss: %.4f", loss_steps / PRINT_STEPS) + loss_steps = 0.0 + + self.train_dataset.reset() + + logger.info("Training done") + + if self.eval_dataset: + logger.info("Begin eval...") + acc = self.eval_fn(self.eval_dataset) + logger.info("Accuracy: %s", str(acc)) + + if self.checkpoint_save_path: + self.net.save_checkpoint(self.checkpoint_save_path) + logger.info("Checkpoint dumpped successful") \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/hhblits_task.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/hhblits_task.py new file mode 100644 index 000000000..dd224f814 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/hhblits_task.py @@ -0,0 +1,114 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from mindspore import nn +import mindspore.ops as ops + +from .task_datasets import create_hhblits_dataset, map_label_to_category, \ + LABEL_MASKER, HHBLITS_D3_LABEL_TO_CATE, HHBLITS_D8_LABEL_TO_CATE +from .downstream_nets import ConvNet, EmbeddingTaskNet +from .downstream_task import BaseTask, lr_secheduler + + +class TokenLevelAccuracy: + """Sequence token level classify task accuracy.""" + def __init__(self, num_classes): + self.num_classes = num_classes + self.acc = nn.Accuracy('classification') + self.acc.clear() + + def update(self, logits, labels): + act_labels = labels.view(-1) + act_logits = logits.view(-1, self.num_classes) + valid_labels = act_labels[act_labels > -1] + valid_logits = act_logits[act_labels > -1] + + self.acc.update(valid_logits, valid_labels) + + def get(self): + return self.acc.eval() + + +class HHblitsTask(BaseTask): + """Pretrain property prediction task of hhblits dataset. 
The network is a convnet and token level classifier.""" + def __init__(self, config): + super().__init__(config) + cnn_net = ConvNet() + self.net = EmbeddingTaskNet(cnn_net, self.t5_config_path) + + if self.checkpoint_path: + self.net.load_from_pretrained(self.checkpoint_path) + + if self.train_conf.train_data_path: + self.train_dataset = create_hhblits_dataset(self.train_conf.train_data_path, self.t5_tokenizer, self.train_conf.batch_size) + batch_num = self.train_dataset.get_dataset_size() + learning_rate = lr_secheduler(self.train_conf.lr, batch_num, self.train_conf.epochs) + self.optimizer = nn.Adam(self.net.trainable_params(), learning_rate=learning_rate) + self.loss_fn = nn.CrossEntropyLoss(reduction='mean', ignore_index=LABEL_MASKER) + + # eval + if self.train_conf.eval_data_path: + self.eval_dataset = create_hhblits_dataset(self.train_conf.eval_data_path, self.t5_tokenizer, self.train_conf.batch_size) + + @staticmethod + def __eval_fn(model_fn, dataset): + """eval dataset accuracy.""" + metric_q3 = TokenLevelAccuracy(3) + metric_q8 = TokenLevelAccuracy(8) + + for inputs, masks, d3labels, d8labels, _ in dataset: + logits1, logits2, _ = model_fn(inputs, masks) + metric_q3.update(logits1, d3labels) + metric_q8.update(logits2, d8labels) + + dataset.reset() + _m3acc = metric_q3.get() + _m8acc = metric_q8.get() + return _m3acc, _m8acc + + def eval_fn(self, dataset): + return HHblitsTask.__eval_fn(self.net, dataset) + + def eval_acc(self, eval_data_path): + eval_dataset = create_hhblits_dataset(eval_data_path, self.t5_tokenizer, self.train_conf.batch_size) + _m3acc, _m8acc = self.eval_fn(eval_dataset) + return _m3acc, _m8acc + + def token_level_crossentoryloss(self, logits, labels, num_classes, loss_fn): + activate_labels = labels.view(-1) + activate_logits = logits.view(-1, num_classes) + return loss_fn(activate_logits, activate_labels) + + def predict(self, data): + logits1, logits2, _ = self.net(*data) + softmax = ops.Softmax(axis=-1) + probabilities1 = softmax(logits1) + probabilities2 = softmax(logits2) + + # get token index of predict max probabilities + predicted_labels1 = ops.Argmax(axis=-1)(probabilities1).asnumpy() + predicted_labels2 = ops.Argmax(axis=-1)(probabilities2).asnumpy() + predicted_cates1 = map_label_to_category(predicted_labels1, HHBLITS_D3_LABEL_TO_CATE) + predicted_cates2 = map_label_to_category(predicted_labels2, HHBLITS_D8_LABEL_TO_CATE) + return predicted_cates1, predicted_cates2 + + def forward_fn(self, inputs, masks, d3labels, d8labels, disolabels): + """multitask loss""" + logits1, logits2, _ = self.net(inputs, masks) + loss1 = self.token_level_crossentoryloss( + logits1, d3labels, 3, self.loss_fn) + loss2 = self.token_level_crossentoryloss( + logits2, d8labels, 8, self.loss_fn) + return 0.5 * loss1 + 0.5 * loss2 + \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/protT5_downstream_tasks.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/protT5_downstream_tasks.py new file mode 100644 index 000000000..e0006478b --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/protT5_downstream_tasks.py @@ -0,0 +1,76 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""ProtT5 Downstream Task class implment.""" +from mindspore import jit + +from ...model import Model + +from .hhblits_task import HHblitsTask +from .deeploc_task import DeeplocTask + + +class ProtT5DownstreamTasks(Model): + '''ProtT5DownstreamTasks''' + + def __init__(self, config): + self.name = "ProtT5DownstreamTasks" + self.mixed_precision = False + + self.config = config + self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/protT5_xl.ckpt" + self.checkpoint_path = "./protT5_xl.ckpt" + + self.mode = config.mode + self.task_name = config.task_name + + if self.task_name == "hhblits": + self.network = HHblitsTask(config) + elif self.task_name == "deeploc": + self.network = DeeplocTask(config) + + super().__init__(self.checkpoint_url, self.checkpoint_path, self.network, self.name, None, + mixed_precision=self.mixed_precision) + + + def forward(self, data): + pass + + def backward(self, feat): + pass + + def predict(self, data): + return self.network.predict(data) + + def eval_acc(self, data_path): + if self.task_name == "hhblits": + m3acc, m8acc = self.network.eval_acc(data_path) + print("Accuracy Q3 %.4f; Q8 %.4f" % (m3acc, m8acc)) + elif self.task_name == "deeploc": + acc = self.network.eval_acc(data_path) + print("Accuracy %.4f" % acc) + + def train(self): + self.network.train() + + @jit + def train_step(self, data): + self.network.train_step(*data) + + @jit + def _jit_forward(self, data): + pass + + def _pynative_forward(self, data): + pass diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/task_datasets.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/task_datasets.py new file mode 100644 index 000000000..954cea99c --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/task_datasets.py @@ -0,0 +1,198 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""dataset loader of tasks.""" +import random +import re + +import numpy as np +import pandas as pd + +import mindspore.dataset as ds +from mindformers import T5Tokenizer + +from ..utils.utils import seqs_tokenizer +from ....dataset import DataSet + +LABEL_MASKER = -100 + +# Category Definitions in Data +MEMBRANE_CATES = {'M': 0, 'S': 1, 'U': 1} +LOC_CATES = { + 'Cell.membrane': 0, 'Cytoplasm': 1, 'Endoplasmic.reticulum': 2, + 'Golgi.apparatus': 3, 'Lysosome/Vacuole': 4, 'Mitochondrion': 5, + 'Nucleus': 6, 'Peroxisome': 7, 'Plastid': 8, 'Extracellular': 9 + } +HHBLITS_D3_CATES = {'C': 0, 'E': 1, 'H': 2} +HHBLITS_D8_CATES = {'C': 0, 'E': 1, 'H': 2, 'B': 3, 'G': 4, 'I': 5, 'S': 6, 'T': 7} + + +def reverse_dict(original_dict): + return {value: key for key, value in original_dict.items()} + +MEMBRANE_LABEL_TO_CATE = {0: 'M', 1: 'S'} +LOC_LABEL_TO_CATE = reverse_dict(LOC_CATES) +HHBLITS_D3_LABEL_TO_CATE = reverse_dict(HHBLITS_D3_CATES) +HHBLITS_D8_LABEL_TO_CATE = reverse_dict(HHBLITS_D8_CATES) + + +def map_label_to_category(labels, dct): + vectorized_map = np.vectorize(lambda label: dct.get(label, '')) + str_labels = vectorized_map(labels) + return str_labels + + +def seq2array(seq): + return np.array(seq).astype(np.int32) + + +def pad_trunc_addspecial(seq, max_length, sp=LABEL_MASKER, pad=0, add_special_tokens=True): + if len(seq) > max_length: + if add_special_tokens: + seq = seq[:max_length-1] + else: + seq = seq[:max_length] + + if add_special_tokens: + seq.append(sp) + + padded = seq + [pad] * max_length + return padded[:max_length] + + +def get_task_info(cate_name): + if cate_name == 'loc': + return LOC_CATES + elif cate_name == 'membrane': + return MEMBRANE_CATES + else: + return {} + + +def apply_tokenizer(text, tokenizer): + text = re.sub(r"[UZOB]", "X", text) + tokens = tokenizer(text, padding='max_length', truncation=True, add_special_tokens=True, max_length=512) + ids, masks = tokens['input_ids'], tokens['attention_mask'] + return ids, masks + + +def create_deeploc_dataset(file_path, tokenizer, batch_size=32, cate_name=''): + # read CSV file + df = pd.read_csv(file_path) + res = [] + df = df.rename(columns=lambda x: x.strip()) + + cate_dict = get_task_info(cate_name) + + for _, row in df.iterrows(): + text, cate = row['input'], row[cate_name] + ids, masks = apply_tokenizer(text, tokenizer) + eles = [seq2array(x) for x in [ids, masks, cate_dict.get(cate)]] + res.append(tuple(eles)) + + random.shuffle(res) + dataset = ds.GeneratorDataset(res, column_names=["inputs", "masks", "labels"]) + dataset = dataset.shuffle(buffer_size=128) + dataset = dataset.batch(batch_size=batch_size) + return dataset + + +def load_hhblits_dataset(path): + df = pd.read_csv(path, names=['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'], skiprows=1) + df = df.rename(columns=lambda x: x.strip()) + + input_fixed = ["".join(seq.split()) for seq in df['input']] + input_fixed = [re.sub(r"[UZOB]", "X", seq) for seq in input_fixed] + seqs = [" ".join(seq) for seq in input_fixed] + + label_fixed3 = ["".join(label.split()) for label in df['dssp3']] + d3_labels = [list(label) for label in label_fixed3] + + label_fixed8 = ["".join(label.split()) for label in df['dssp8']] + d8_labels = [list(label) for label in label_fixed8] + + disorder_fixed = [" ".join(disorder.split()) for disorder in df['disorder']] + disorder = [disorder.split() for disorder in disorder_fixed] + + return seqs, d3_labels, d8_labels, disorder + + +def 
create_hhblits_dataset(file_path, tokenizer, batch_size=32): + # read CSV file + seqs, d3_labels, d8_labels, disorder = load_hhblits_dataset(file_path) + res = [] + for seq, d3, d8, diso in zip(seqs, d3_labels, d8_labels, disorder): + ids, masks = apply_tokenizer(seq, tokenizer) + + _d3 = [HHBLITS_D3_CATES[x.strip()] for x in d3] + _d3 = pad_trunc_addspecial(_d3, 512, pad=LABEL_MASKER) + + _d8 = [HHBLITS_D8_CATES[x.strip()] for x in d8] + _d8 = pad_trunc_addspecial(_d8, 512, pad=LABEL_MASKER) + + _diso = [int(float(x.strip())) for x in diso] + _diso = pad_trunc_addspecial(_diso, 512, pad=LABEL_MASKER) + + eles = [ids, masks, _d3, _d8, _diso] + eles_tp = [seq2array(x) for x in eles] + res.append(tuple(eles_tp)) + + random.shuffle(res) + dataset = ds.GeneratorDataset(res, column_names=["inputs", "masks", "d3labels", "d8labels", "disolabels"]) + dataset = dataset.shuffle(buffer_size=128) + dataset = dataset.batch(batch_size=batch_size) + return dataset + + +class ProtT5TaskDataSet(DataSet): + """ProtT5 downstream task dataSet""" + def __init__(self, config): + self.task_name = config.task_name + self.data_path = None + self.dataset = None + self.batch_size = config.train.batch_size + self.t5_tokenizer = T5Tokenizer.from_pretrained(config.t5_config_path) + self.phase = None + + # pylint: disable=E0302 + def __getitem__(self): + pass + + def __len__(self): + if self.dataset: + return self.dataset.get_dataset_size() + return 0 + + def set_phase(self, phase): + self.phase = phase + + def process(self, data, **kwargs): + return seqs_tokenizer(data, self.t5_tokenizer, return_tensors="ms") + + def set_training_data_src(self, data_source, **kwargs): + self.data_path = data_source + + def download(self, path=None): + pass + + def data_parse(self, idx): + pass + + def create_iterator(self, num_epochs, cate_name = ''): + if self.task_name == "hhblits": + self.dataset = create_hhblits_dataset(self.data_path, self.t5_tokenizer, self.batch_size) + else: + self.dataset = create_deeploc_dataset(self.data_path, self.t5_tokenizer, self.batch_size, cate_name=cate_name) + + return self.dataset diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/__init__.py new file mode 100644 index 000000000..175a0ece7 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Module""" \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/optimization.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/optimization.py new file mode 100644 index 000000000..68e8c2a66 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/optimization.py @@ -0,0 +1,55 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""create optimizer""" +import mindspore +from mindspore import ops, nn +from mindspore.nn.learning_rate_schedule import LearningRateSchedule + +from mindspore.nn.optim import AdaFactor + + +class WarmUpPolynomialDecayLR(LearningRateSchedule): + """Polynomia Decay LR with Warmup""" + def __init__(self, learning_rate, end_learning_rate, warmup_steps, decay_steps, power): + super().__init__() + self.learning_rate = learning_rate + self.warmup_steps = max(warmup_steps, 1) + self.end_learning_rate = end_learning_rate + self.decay_steps = decay_steps + self.power = power + + def construct(self, global_step): + # warmup lr + warmup_percent = global_step.astype(mindspore.float32) / self.warmup_steps + warmup_learning_rate = self.learning_rate * warmup_percent + # polynomial lr + global_step = ops.minimum(global_step, self.decay_steps) + decayed_learning_rate = (self.learning_rate - self.end_learning_rate) * \ + ops.pow((1 - global_step / self.decay_steps), self.power) + \ + self.end_learning_rate + is_warmup = (global_step < self.warmup_steps).astype(mindspore.float32) + learning_rate = ((1.0 - is_warmup) * decayed_learning_rate + is_warmup * warmup_learning_rate) + return learning_rate + + +def create_optimizer(model, init_lr, optim_type, weight_decay=0.0): + if optim_type == 'adafactor': + optim = AdaFactor(model.trainable_params()) + elif weight_decay > 0: + optim = nn.AdamWeightDecay(model.trainable_params(), init_lr, weight_decay=weight_decay) + else: + optim = nn.Adam(model.trainable_params(), learning_rate=init_lr) + + return optim \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_configuration.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_configuration.py new file mode 100644 index 000000000..1900dc847 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_configuration.py @@ -0,0 +1,23 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"protT5 pretrain task configure" +protT5pretrain_configuration = { + "protT5_base": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml", + "protT5downtask_predict": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml", + "protT5downtask_train": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml" +} \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_dataloader.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_dataloader.py new file mode 100644 index 000000000..9d3f1cc9b --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_dataloader.py @@ -0,0 +1,116 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""data loader of mindreord files; add mask, pad and bacth process.""" +import os + +import numpy as np +import mindspore.dataset as ds +from mindformers import T5Tokenizer + +from ..utils.utils import seqs_tokenizer +from ....dataset import DataSet + +MASK_TOKEN_ID = 33 +PAD_INDEX = 0 + + +class EncDecIds(): + """return `input_ids', 'masks', 'decode_ids'.""" + def __init__(self, mask_prob): + super().__init__() + self.mask_prob = mask_prob + + def __call__(self, raw_ids): + decode_ids = np.array(raw_ids) + mask_ids = np.ones_like(raw_ids) + + np.random.seed(23) + # determine which positions should be masked + random_mask = np.random.rand(len(raw_ids)) < self.mask_prob + random_mask[len(raw_ids) - 1] = False + raw_ids[random_mask] = MASK_TOKEN_ID + + return (np.array(raw_ids), mask_ids, decode_ids) + + +def find_mindrecord_files(directory): + files = os.listdir(directory) + mindrecord_files = [os.path.join(directory, f) for f in files if f.endswith('.mindrecord')] + return mindrecord_files + + +def create_pretrain_dataset(mr_files, batch_size, epochs, rank_size=0, rank_id=0): + if rank_size > 0: + dataset = ds.MindDataset(dataset_files=mr_files, columns_list=["raw_ids"], + num_shards=rank_size, shard_id=rank_id, shuffle=True) + else: + dataset = ds.MindDataset(dataset_files=mr_files, columns_list=["raw_ids"], shuffle=True) + dataset = dataset.map(operations=EncDecIds(0.15), input_columns=["raw_ids"], + output_columns=["input_ids", "masks", "decode_ids"]) + + # default: pad = 0 + padding_shape = ([512], 0) + pad_info = {"input_ids": padding_shape, "masks": padding_shape, "decode_ids": padding_shape} + dataset = dataset.padded_batch(batch_size=batch_size, drop_remainder=True, pad_info=pad_info) + dataset = dataset.repeat(epochs) + return dataset + + +class ProtT5TrainDataSet(DataSet): + """ProtT5 downstream task dataSet""" + def __init__(self, config): + self.batch_size = config.train.batch_size + self.data_path = None + self.dataset = None + self.phase = None + self.t5_config_path = config.t5_config_path + self.tokenizer = None + + # pylint: disable=E0302 + def __getitem__(self): + pass + + def __len__(self): + if self.dataset: + return self.dataset.get_dataset_size() + return 0 + + def set_phase(self, phase): + self.phase = phase + + def process(self, data, mode="embedding"): + re_type = "ms" + if mode == "generate": + re_type = "np" + + if not self.tokenizer: + self.tokenizer = T5Tokenizer.from_pretrained(self.t5_config_path) + return seqs_tokenizer(data, self.tokenizer, return_tensors=re_type) + + def set_training_data_src(self, data_source, **kwargs): + self.data_path = data_source + + def download(self, path=None): + pass + + def data_parse(self, idx): + pass + + def create_iterator(self, num_epochs, rank_size=0, rank_id=0): + mr_files = find_mindrecord_files(self.data_path) + self.dataset = create_pretrain_dataset(mr_files, self.batch_size, num_epochs, rank_size=rank_size, rank_id=rank_id) + data_loader = self.dataset.create_tuple_iterator() + return data_loader + diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_protT5.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_protT5.py new file mode 100644 index 000000000..23c51322b --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_protT5.py @@ -0,0 +1,214 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); 
+# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""ProtT5 Trainer with data parallel.""" +import time +import os + +import mindspore as ms +from mindspore import nn, value_and_grad +from mindspore.amp import all_finite +from mindspore.ops import functional as F +from mindspore.parallel._utils import _get_device_num, _get_gradients_mean +from mindspore.communication import init, get_rank, get_group_size +from mindformers.core.clip_grad import ClipGradNorm +from mindformers.tools.logger import get_logger +from mindformers import T5Tokenizer + +from .optimization import create_optimizer, WarmUpPolynomialDecayLR +from .pretrain_dataloader import create_pretrain_dataset, find_mindrecord_files +from .t5_modeling import create_model +from ..utils.utils import generate_checkpoint_filename +from ...model import Model + +PRINT_ITERS = 10 +logger = get_logger(logger_name='Pretrain') + + +class ProtT5(Model): + """ProtT5""" + name = "ProtT5" + + def __init__(self, config): + self.mixed_precision = False + self.config = config + self.use_parallel = config.parallel + self.rank_id = 0 + self.rank_size = 1 + self.init_context() + + self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/protT5_xl.ckpt" + self.checkpoint_path = "./protT5_xl.ckpt" + self.mode = config.mode + self.train_conf = config.train + + if self.mode == "train": + self.network = create_model(config.t5_config_path, config.load_model_path) + self.init_trainer() + else: + self.tokenizer = T5Tokenizer.from_pretrained(config.t5_config_path) + self.network = create_model(config.t5_config_path, from_pretrained=True) + + super().__init__(self.checkpoint_url, self.checkpoint_path, self.network, self.name, None, + mixed_precision=self.mixed_precision) + + + def init_context(self): + if self.use_parallel: + init() + self.rank_id = get_rank() + self.rank_size = get_group_size() + ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend", device_id=self.rank_id) + ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, parameter_broadcast=True, + device_num=self.rank_size, gradients_mean=True) + + else: + ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") + + + def init_trainer(self): + if self.train_conf.save_ckpt_path: + os.makedirs(self.train_conf.save_ckpt_path, exist_ok=True) + + # data_loader + dataset_path = find_mindrecord_files(self.train_conf.train_data_path) + train_dataset = create_pretrain_dataset(dataset_path, self.train_conf.batch_size, self.train_conf.epochs, + rank_size=self.rank_size, rank_id=self.rank_id) + + self.train_dataloader = train_dataset.create_tuple_iterator() + num_train_steps = train_dataset.get_dataset_size() + + # grad clip + self.use_clip_grad = False + if self.train_conf.use_clip_grad: + self.use_clip_grad = True + self.clip_grad_norm = ClipGradNorm(max_norm=self.train_conf.max_grad_norm) + + # trick: warm up + if self.train_conf.warmup_steps > 0: + lr = WarmUpPolynomialDecayLR(self.train_conf.lr, 0.0, self.train_conf.warmup_steps, 
num_train_steps, 1.0) + else: + lr = self.train_conf.lr + + # Define optimizer. + self.optimizer = create_optimizer(self.network, lr, 'adam', weight_decay=0) + + # data parall + if self.use_parallel: + degree = _get_device_num() + mean = _get_gradients_mean() + self.grad_reducer = nn.DistributedGradReducer(self.optimizer.parameters, mean, degree) + + weights = self.network.trainable_params() + self.grad_fn = value_and_grad(self.forward_fn, None, weights, has_aux=False) + + def forward_fn(self, input_ids, input_mask, decode_ids): + loss = self.network(input_ids, input_mask, decode_ids) + return loss + + def save_checkpoint(self, train_step_nums): + if self.rank_id == 0: + filename = generate_checkpoint_filename(self.train_conf.save_ckpt_path, train_step_nums) + ms.save_checkpoint(self.network, filename) + + def train(self): + # train begin + loss_total = 0 + cur_step_nums, train_step_nums, skip_step_nums = 0, 0, 0 + cur_time, avg_time = time.time(), 0 + + # step begin + self.network.set_train(True) + + for input_ids, input_mask, decode_ids in self.train_dataloader: + loss, is_finite = self._train_step(input_ids, input_mask, decode_ids) + if is_finite: + loss_total = loss_total + loss.asnumpy().item() + train_step_nums += 1 + else: + logger.warning(f"grads overflow, skip step {cur_step_nums}; loss: {loss}") + skip_step_nums += 1 + + if train_step_nums % PRINT_ITERS == 0 and train_step_nums != 0: + print_time = time.time() + total_time = print_time - cur_time + cur_time = print_time + avg_time = total_time / (PRINT_ITERS + skip_step_nums) + + logger.info(f"avg_time(ms): {avg_time * 1000:2f}, " + f"cur_step: {cur_step_nums}, " + f"skip_steps: {skip_step_nums:3d}, " + f"train_step: {train_step_nums}, " + f"loss: {loss_total/PRINT_ITERS:f}, ") + + loss_total = 0 + skip_step_nums = 0 + + # saving ckpt per N steps or last step + if train_step_nums % self.train_conf.save_steps == 0: + self.save_checkpoint(train_step_nums) + + cur_step_nums += 1 + + self.save_checkpoint(train_step_nums) + logger.info("Pretrain done!") + + + @ms.jit + def _train_step(self, input_ids, input_mask, decode_ids): + loss, grads = self.grad_fn(input_ids, input_mask, decode_ids) + + if self.use_parallel: + grads = self.grad_reducer(grads) + + is_finite = all_finite(grads) + + if is_finite: + # Apply gradient clipping + if self.use_clip_grad: + grads, _ = self.clip_grad_norm(grads) + + loss = F.depend(loss, self.optimizer(grads)) + + return loss, is_finite + + def train_step(self, data): + return self._train_step(*data) + + def predict(self, data, mode="embedding"): + self.network.set_train(False) + token_ids, attention_mask = data + if mode == "generate": + """Generate the sequence of input texts.""" + output_ids = self.network.generate(token_ids, do_sample=False) + output_tokens = self.tokenizer.decode(output_ids, skip_special_tokens=True) + return output_tokens + elif mode == "embedding": + """Embedding of the final layer of encoder""" + outputs = self.network.encoder_forward(token_ids, attention_mask) + hiddens = outputs.asnumpy() + return hiddens + + + def forward(self, data): + pass + + def backward(self, data): + pass + + def _jit_forward(self, data): + pass + + def _pynative_forward(self, data): + pass \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/t5_modeling.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/t5_modeling.py new file mode 100644 index 000000000..214b45302 --- /dev/null +++ 
b/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/t5_modeling.py @@ -0,0 +1,97 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""T5 model with initial method.""" +import logging + +import mindspore +from mindspore.common.initializer import initializer, Constant, Normal + +from mindformers import AutoConfig +from mindformers import T5ForConditionalGeneration as T5WithLoss +from mindformers.modules.transformer.transformer import default_transformer_config, TransformerOpParallelConfig + +logger = logging.getLogger(__name__) + + +def set_data(weight, init_distribution): + weight.set_data(initializer(init_distribution, weight.shape, weight.dtype)) + + +def init_cell(cell, name, config): + factor = config.initializer_factor + if "layernorm" in name: + set_data(cell.gamma, Constant(factor * 1.0)) + elif "tfm_embedding_lookup" in name: + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + set_data(cell.embedding_table, Normal(factor * 1.0)) + elif name.endswith("output"): + # default these cells has no bias + set_data(cell.mapping.weight, Normal(factor * ((config.hidden_size) ** -0.5))) + set_data(cell.projection.weight, Normal(factor * ((config.d_ff) ** -0.5))) + elif name.endswith("attention"): + d_model = config.hidden_size + key_value_proj_dim = config.kv_size + n_heads = config.num_heads + + # q, k, v, o parameter + set_data(cell.dense1.weight, Normal(factor * ((d_model * key_value_proj_dim) ** -0.5))) + set_data(cell.dense2.weight, Normal(factor * (d_model**-0.5))) + set_data(cell.dense3.weight, Normal(factor * (d_model**-0.5))) + set_data(cell.projection.weight, Normal(factor * ((n_heads * key_value_proj_dim) ** -0.5))) + + if cell.has_relative_bias and cell.is_cross_atten: + set_data(cell.cross_bias, Normal(factor * (d_model**-0.5))) + + +def init_t5_weights(cell, config, prefix=''): + if hasattr(cell, 'add_name'): + return + + cell.add_flags(add_name=prefix) + init_cell(cell, prefix, config) + + for name, sub_cell in cell.cells_and_names(): + hier_name = prefix + "." 
+ name + init_t5_weights(sub_cell, config, prefix=hier_name) + + +def trans_to_transformer_config(parallel_config): + if not parallel_config: + return default_transformer_config + + return TransformerOpParallelConfig(**parallel_config) + + +def create_model(config_path, load_model_path=None, parallel_config=None, from_pretrained=False): + if from_pretrained: + return T5WithLoss.from_pretrained(config_path) + + base_config = AutoConfig.from_pretrained(config_path) + base_config.parallel_config = trans_to_transformer_config(parallel_config) + model = T5WithLoss(base_config) + + if load_model_path: + # load from checkpoint path + param_dict = mindspore.load_checkpoint(load_model_path) + mindspore.load_param_into_net(model, param_dict) + logger.info("pretrain: load ckpt successful") + else: + # init T5 + init_t5_weights(model, base_config, prefix="") + logger.info("pretrain: inited successful") + + return model + diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/convert_weight.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/convert_weight.py new file mode 100644 index 000000000..3ce778d66 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/convert_weight.py @@ -0,0 +1,180 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Convert checkpoint from torch/huggingface: source from mindformers""" +import argparse +import numpy as np +import torch +from mindspore import save_checkpoint, Tensor + + +def generate_params_dict(total_layers, + mindspore_params_per_layer, + torch_params_per_layer, + mindspore_additional_params, + torch_additional_params): + """ + Generate the total parameter mapping of mindspore and pytorch. + + Args: + total_layers(int): The total layers of the net. + mindspore_params_per_layer(list): The list of params per layer for the net of mindspore. + torch_params_per_layer(list): The list of params per layer for the net of pytorch. + mindspore_additional_params(list): The list of params outside the layer for the net of mindspore + torch_additional_params(list): The list of params outside the layer for the net of pytorch. + + Returns: + A list of tuple. The first element is the parameter name of mindspore, + the another is the parameter name of pytorch. 
+ """ + mapped_params = list(zip(mindspore_params_per_layer, torch_params_per_layer)) + ms_extend_param_list = [] + torch_extend_param_list = [] + for i in range(total_layers): + for ms_para, torch_para in mapped_params: + src = ms_para.format(i) + tgt = torch_para.format(i) + + ms_extend_param_list.append(src) + torch_extend_param_list.append(tgt) + + mapped_params = list(zip(mindspore_additional_params, torch_additional_params)) + for ms_para, torch_para in mapped_params: + ms_extend_param_list.append(ms_para) + torch_extend_param_list.append(torch_para) + + return list(zip(ms_extend_param_list, torch_extend_param_list)) + +def get_converted_ckpt(mapped_params, weight_dict): + """ + Print the keys of the loaded checkpoint + + Args: + mapped_params(dict): The loaded checkpoint. The key is parameter name and value is the numpy array. + weight_dict(dict): The loaded pytorch checkpoint. + + Returns: + None + """ + new_ckpt_list = [] + # Currently, the ms_extend_param the torch_extend_param is the full parameters. + for src, tgt in mapped_params: + value = weight_dict[tgt].numpy() + is_transpose = "" + if '.o.' in tgt or '.wi.' in tgt or '.wo.' in tgt: + value = np.transpose(value, [1, 0]) + is_transpose = " transposed" + print(f"Mapping table Mindspore:{src:<30} \t Torch:{tgt:<30} with shape {value.shape}" + f"---{is_transpose}") + new_ckpt_list.append({"data": Tensor(value), "name": src}) + return new_ckpt_list + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="OPT convert script") + parser.add_argument('--layers', + type=int, + default=1, + help="The number of layers of the model to be converted.") + parser.add_argument("--torch_path", + type=str, + default=None, + required=True, + help="The torch checkpoint path.") + parser.add_argument("--mindspore_path", + type=str, + required=True, + default="The output mindspore checkpoint path.", + help="Use device nums, default is 128.") + + opt = parser.parse_args() + state_dict = torch.load(opt.torch_path, map_location='cpu') + + ms_name = [ + "t5_model.tfm_encoder.blocks.{}.layernorm1.gamma", + "t5_model.tfm_encoder.blocks.{}.layernorm2.gamma", + "t5_model.tfm_encoder.blocks.{}.attention.dense1.weight", + "t5_model.tfm_encoder.blocks.{}.attention.dense2.weight", + "t5_model.tfm_encoder.blocks.{}.attention.dense3.weight", + "t5_model.tfm_encoder.blocks.{}.attention.projection.weight", + "t5_model.tfm_encoder.blocks.{}.output.mapping.weight", + "t5_model.tfm_encoder.blocks.{}.output.projection.weight", + + "t5_model.tfm_decoder.blocks.{}.layernorm1.gamma", + "t5_model.tfm_decoder.blocks.{}.cross_attention_layernorm.gamma", + "t5_model.tfm_decoder.blocks.{}.layernorm2.gamma", + "t5_model.tfm_decoder.blocks.{}.attention.dense1.weight", + "t5_model.tfm_decoder.blocks.{}.attention.dense2.weight", + "t5_model.tfm_decoder.blocks.{}.attention.dense3.weight", + "t5_model.tfm_decoder.blocks.{}.attention.projection.weight", + + "t5_model.tfm_decoder.blocks.{}.cross_attention.dense1.weight", + "t5_model.tfm_decoder.blocks.{}.cross_attention.dense2.weight", + "t5_model.tfm_decoder.blocks.{}.cross_attention.dense3.weight", + "t5_model.tfm_decoder.blocks.{}.cross_attention.projection.weight", + "t5_model.tfm_decoder.blocks.{}.output.mapping.weight", + "t5_model.tfm_decoder.blocks.{}.output.projection.weight", + ] + + torch_name = [ + "encoder.block.{}.layer.0.layer_norm.weight", + "encoder.block.{}.layer.1.layer_norm.weight", + "encoder.block.{}.layer.0.SelfAttention.q.weight", + "encoder.block.{}.layer.0.SelfAttention.k.weight", + 
"encoder.block.{}.layer.0.SelfAttention.v.weight", + "encoder.block.{}.layer.0.SelfAttention.o.weight", + "encoder.block.{}.layer.1.DenseReluDense.wi.weight", + "encoder.block.{}.layer.1.DenseReluDense.wo.weight", + + "decoder.block.{}.layer.0.layer_norm.weight", + "decoder.block.{}.layer.1.layer_norm.weight", + "decoder.block.{}.layer.2.layer_norm.weight", + "decoder.block.{}.layer.0.SelfAttention.q.weight", + "decoder.block.{}.layer.0.SelfAttention.k.weight", + "decoder.block.{}.layer.0.SelfAttention.v.weight", + "decoder.block.{}.layer.0.SelfAttention.o.weight", + + "decoder.block.{}.layer.1.EncDecAttention.q.weight", + "decoder.block.{}.layer.1.EncDecAttention.k.weight", + "decoder.block.{}.layer.1.EncDecAttention.v.weight", + "decoder.block.{}.layer.1.EncDecAttention.o.weight", + + "decoder.block.{}.layer.2.DenseReluDense.wi.weight", + "decoder.block.{}.layer.2.DenseReluDense.wo.weight", + ] + + addition_mindspore = [ + "t5_model.encoder_layernorm.gamma", + "t5_model.decoder_layernorm.gamma", + "t5_model.tfm_embedding_lookup.embedding_table", + "t5_model.tfm_encoder.blocks.0.attention.bias_generator.embeddings_table", + "t5_model.tfm_decoder.blocks.0.attention.bias_generator.embeddings_table", + ] + + addition_torch = [ + "encoder.final_layer_norm.weight", + "decoder.final_layer_norm.weight", + "shared.weight", + "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + ] + + + mapped_param = generate_params_dict(total_layers=opt.layers, + mindspore_params_per_layer=ms_name, + torch_params_per_layer=torch_name, + mindspore_additional_params=addition_mindspore, + torch_additional_params=addition_torch) + new_ckpt = get_converted_ckpt(mapped_param, state_dict) + save_checkpoint(new_ckpt, opt.mindspore_path) + print(f"Convert finished, the output is saved to {opt.mindspore_path}") diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/trans_csv_to_mindrecord.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/trans_csv_to_mindrecord.py new file mode 100644 index 000000000..b92711458 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/trans_csv_to_mindrecord.py @@ -0,0 +1,114 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Read the csv files; transform to mindrecord files to train model.""" +import argparse +import os +import csv +import re + +import numpy as np + +from mindspore.mindrecord import FileWriter +from mindformers import T5Tokenizer + + +# mindrecord schema; data format +SCHEMA = {"raw_ids": {"type": "int32", "shape": [-1]}} +MAX_TOKENS_PER_FILE = 400 * 1024 * 1024 // 4 +STEP_PRINT_NUM = 10000 +STEP_SAVE_NUM = 1000 +file_index = 0 + + +def process_df(row, tokenizer): + text = re.sub(r"[UZOB]", "X", row['text']) + tokens = tokenizer(" ".join(text), truncation=True, add_special_tokens=True, max_length=512) + token_ids = tokens['input_ids'] + + sample = { + "raw_ids": np.array(token_ids, dtype=np.int32) + } + return sample, len(token_ids) + + +def get_writer(output_dir): + global file_index + file_name = os.path.join(output_dir, f"data_{file_index}.mindrecord") + writer = FileWriter(file_name, shard_num=1, overwrite=True) + writer.add_schema(SCHEMA, "mindrecord_schema") + file_index += 1 + return writer + + +def converse_file(csv_file_path, num_samples, output_dir, tokenizer): + """read CSV file and transform to mindRecord files.""" + data = [] + current_file_size = 0 + with open(csv_file_path, newline='') as csvfile: + index = 0 + writer = get_writer(output_dir) + reader = csv.DictReader(csvfile) + for row in reader: + index += 1 + if num_samples > 0 and index > num_samples: + break + + if current_file_size > MAX_TOKENS_PER_FILE: + writer.commit() + writer = get_writer(output_dir) + current_file_size = 0 + + sample, token_length = process_df(row, tokenizer) + + # compute current file size + current_file_size += 4 * token_length + data.append(sample) + + if index % STEP_PRINT_NUM == 0: + print(f"Samples {index} Done") + + if index % STEP_SAVE_NUM == 0: + writer.write_raw_data(data) + data = [] + + if data: + writer.write_raw_data(data) + + writer.commit() + + +def run(file_path, num_samples, output_dir): + tokenizer = T5Tokenizer.from_pretrained(args.t5_config_path) + if os.path.isfile(file_path) and file_path.endswith('csv'): + converse_file(file_path, num_samples, output_dir, tokenizer) + else: + csv_files = [os.path.join(file_path, filename) for filename in os.listdir(file_path) if filename.endswith('.csv')] + for cfile in csv_files: + converse_file(cfile, num_samples, output_dir, tokenizer) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--number_samples", default=-1, type=int, + help="Choose maxium process data sample number.") + parser.add_argument("--data_dir", type=str, required=True, + help="Data path to converse to mindrecords; it can file or dir.") + parser.add_argument("--output_dir", type=str, required=True, help="Data path of output.") + parser.add_argument('--t5_config_path', type=str, required=True, help='model name or t5 config path') + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + run(args.data_dir, args.number_samples, args.output_dir) \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/utils/utils.py b/MindSPONGE/src/mindsponge/pipeline/models/protT5/utils/utils.py new file mode 100644 index 000000000..b1a90f23b --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/protT5/utils/utils.py @@ -0,0 +1,34 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import os +import datetime +import re + + +def generate_checkpoint_filename(checkpoint_dir, model_info): + """get datatime of now to generate filename.""" + now = datetime.datetime.now() + timestamp = now.strftime('%Y%m%d_%H%M%S') + filename = f'model_{model_info}_{timestamp}.ckpt' + filepath = os.path.join(checkpoint_dir, filename) + + return filepath + + +def seqs_tokenizer(sequences, tokenizer, return_tensors=None): + # data preprocess; UZOB is rare which are replaced in ProtT5 model + sequences = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences] + tokens = tokenizer(sequences, padding=True, add_special_tokens=True, return_tensors=return_tensors) + return (tokens['input_ids'], tokens["attention_mask"]) \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/pipeline.py b/MindSPONGE/src/mindsponge/pipeline/pipeline.py index 8efdf2752..65c7901f7 100644 --- a/MindSPONGE/src/mindsponge/pipeline/pipeline.py +++ b/MindSPONGE/src/mindsponge/pipeline/pipeline.py @@ -35,6 +35,8 @@ from .models import RASP, RASPDataSet, rasp_configuration from .models import Multimer, MultimerDataSet, multimer_configuration from .models import ProteinMpnn, ProteinMpnnDataset, proteinmpnn_configuration from .models import UFold, UFoldDataSet, ufold_configuration +from .models import ProtT5, ProtT5TrainDataSet, protT5pretrain_configuration +from .models import ProtT5DownstreamTasks, ProtT5TaskDataSet, protT5downtask_configuration model_card = { @@ -53,6 +55,8 @@ model_card = { "Proteinmpnn": {"model": ProteinMpnn, "dataset": ProteinMpnnDataset, "config": proteinmpnn_configuration}, "RASP": {"model": RASP, "dataset": RASPDataSet, "config": rasp_configuration}, "UFold": {"model": UFold, "dataset": UFoldDataSet, "config": ufold_configuration}, + "ProtT5": {"model": ProtT5, "dataset": ProtT5TrainDataSet, "config": protT5pretrain_configuration}, + "ProtT5Downstream": {"model": ProtT5DownstreamTasks, "dataset": ProtT5TaskDataSet, "config": protT5downtask_configuration} } -- Gitee From fba3ca2c26c45f78ae7d7a17fa5896a32e32a8f2 Mon Sep 17 00:00:00 2001 From: "L.L" <920817420@qq.com> Date: Wed, 23 Oct 2024 16:09:04 +0800 Subject: [PATCH 2/6] fix --- MindSPONGE/applications/model_cards/ProtT5.md | 17 ++++-- .../mindsponge/pipeline/models/__init__.py | 4 +- .../models/{protT5 => prot_t5}/__init__.py | 9 ++-- .../downstream/__init__.py | 2 +- .../downstream/deeploc_task.py | 13 +++-- .../downstream/downstream_configuration.py | 6 +-- .../downstream/downstream_nets.py | 22 ++++---- .../downstream/downstream_task.py | 6 ++- .../downstream/hhblits_task.py | 15 +++--- .../downstream/prott5_downstream_tasks.py} | 6 ++- .../downstream/task_datasets.py | 52 +++++++++++-------- .../{protT5 => prot_t5}/pretrain/__init__.py | 2 +- .../pretrain/optimization.py | 6 ++- .../pretrain/pretrain_configuration.py | 4 +- .../pretrain/pretrain_dataloader.py | 9 ++-- .../pretrain/pretrain_prott5.py} | 35 ++++++++----- .../pretrain/t5_modeling.py | 10 ++-- .../scripts/convert_weight.py | 0 
.../scripts/trans_csv_to_mindrecord.py | 5 +- .../models/{protT5 => prot_t5}/utils/utils.py | 2 +- .../src/mindsponge/pipeline/pipeline.py | 9 ++-- 21 files changed, 145 insertions(+), 89 deletions(-) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/__init__.py (75%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/downstream/__init__.py (98%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/downstream/deeploc_task.py (93%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/downstream/downstream_configuration.py (94%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/downstream/downstream_nets.py (86%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/downstream/downstream_task.py (97%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/downstream/hhblits_task.py (93%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5/downstream/protT5_downstream_tasks.py => prot_t5/downstream/prott5_downstream_tasks.py} (95%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/downstream/task_datasets.py (83%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/pretrain/__init__.py (98%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/pretrain/optimization.py (97%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/pretrain/pretrain_configuration.py (97%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/pretrain/pretrain_dataloader.py (96%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5/pretrain/pretrain_protT5.py => prot_t5/pretrain/pretrain_prott5.py} (92%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/pretrain/t5_modeling.py (96%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/scripts/convert_weight.py (100%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/scripts/trans_csv_to_mindrecord.py (96%) rename MindSPONGE/src/mindsponge/pipeline/models/{protT5 => prot_t5}/utils/utils.py (93%) diff --git a/MindSPONGE/applications/model_cards/ProtT5.md b/MindSPONGE/applications/model_cards/ProtT5.md index 2061e55b6..34dbf9204 100644 --- a/MindSPONGE/applications/model_cards/ProtT5.md +++ b/MindSPONGE/applications/model_cards/ProtT5.md @@ -1,6 +1,7 @@ # ProtT5 ## 模型介绍 + 计算生物学和生物信息学从蛋白质序列中获得了大量的数据,非常适合使用自然语言处理中的语言模型。这些语言模型以低推理成本达到了新的预测效果。ProtTrans提供了先进的预训练模型用于蛋白质研究。其中,ProtT5是项目中多个预训练模型中,效果最好的。 详细信息见项目主页及论文:[github](https://github.com/agemagician/ProtTrans) @@ -8,15 +9,18 @@ ProtTrans论文中有两类下游任务,预测蛋白质相关性质和氨基酸的性质,对应的分别是sample level和token level的分类模型; 实验具体信息可以参考作者论文和项目主页中的描述。 下游任务实验的训练数据, 评测数据都可以从项目主页中提供的链接下载。 - ### 模型权重获取 + 可以从如下链接中下载torch版本的checkpoint;[模型页面](https://huggingface.co/Rostlab/prot_t5_xl_uniref50)。 然后使用`protT5/scripts`文件夹中的`convert_weight.py`脚本转换为mindspore支持的格式, 脚本使用方式如下: + ```shell python scripts/convert_weight.py --layers 24 --torch_path pytorch_model.bin --mindspore_path ./mindspore_t5.ckpt ``` + 转换完成后,需要添加yaml格式的配置文件; 具体可以参考: `model_configs/ProtT5/t5_xl.yaml`; 直接把这个文件复制到相应的目录中也可以; 配置文件参数的含义可以参考mindformers中t5_config定义。 这些文件在`MindSPONGE`项目下。 - 文件结构 + ```bash # checkpoint文件组织格式如下 └── prot_t5_xl_uniref50 @@ -30,11 +34,13 @@ python scripts/convert_weight.py --layers 24 --torch_path pytorch_model.bin --mi ## 如何使用 ### Dependencies + * mindspore >= 2.3.0 * mindformers >= 1.2.0 * sentencepiece >= 0.2.0 ### ProtT5预测 + ```bash from mindsponge import PipeLine @@ -62,6 +68,7 @@ print("Embedding:", res) ``` ### 
ProtT5预训练 + ```bash # 单卡; 按照配置文件配置好yaml文件 config_path = 'configs/t5_pretrain.yaml' # 根据需要改为本地路径 @@ -74,8 +81,8 @@ pipe.model.train() msrun --worker_num=${worknum} --local_worker_num=${worknum} --master_port=8128 --log_dir=msrun_log --join=True --cluster_time_out=600 ./run_pretrain.py ``` - ### 下游任务 + ```bash import mindspore as ms from mindsponge import PipeLine @@ -107,11 +114,13 @@ pipe.model.train() ``` ### 预训练说明 + ProtTrans主要工作是在蛋白质氨基酸序列上训练的预训练模型, 下面是模型训练相关一些说明。 - 数据转换 为了训练效率,首先需要原始数据转换成mindrecord格式。原始的预训练数据可以使用`uniref50`数据, 下面是数据转换脚本的路径及其使用方式。`number_samples`指定了想转换的样本数量,默认是`-1`转换全部数据。 + ```shell # 参数分别是: 原始csv数据目录; 转换后的目录; 模型checkpoint路径; 转换样本数 python scripts/trans_csv_to_mindrecord.py --data_dir ../unif50 --output_dir ../unif50_mindrecord --t5_config_path ../prot_t5_xl_uniref50 --number_samples 50000 @@ -120,6 +129,7 @@ python scripts/trans_csv_to_mindrecord.py --data_dir ../unif50 --output_dir ../u - T5参数配置 除了网络中每层的参数量和dropout比例,下面几个参数也需要注意 + ```yaml # 初始化权重缩放比例; 一般小于等于1 initializer_factor: 1.0 @@ -132,14 +142,13 @@ softmax_compute_type: "float32" compute_dtype: "float32" ``` - ## 引用 ```bash @article{9477085, author={Elnaggar, Ahmed and Heinzinger, Michael and Dallago, Christian and Rehawi, Ghalia and Yu, Wang and Jones, Llion and Gibbs, Tom and Feher, Tamas and Angerer, Christoph and Steinegger, Martin and Bhowmik, Debsindhu and Rost, Burkhard}, journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, - title={ProtTrans: Towards Cracking the Language of Lifes Code Through Self-Supervised Deep Learning and High Performance Computing}, + title={ProtTrans: Towards Cracking the Language of Lives Code Through Self-Supervised Deep Learning and High Performance Computing}, year={2021}, volume={}, number={}, diff --git a/MindSPONGE/src/mindsponge/pipeline/models/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/__init__.py index d3e15ebc7..3a01e8077 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/__init__.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/__init__.py @@ -35,5 +35,5 @@ from .multimer import Multimer, MultimerDataSet, multimer_configuration from .proteinmpnn import ProteinMpnn, ProteinMpnnDataset, proteinmpnn_configuration from .ufold import UFold, UFoldDataSet, ufold_configuration from .rasp import RASP, RASPDataSet, rasp_configuration -from .protT5 import ProtT5, ProtT5TrainDataSet, protT5pretrain_configuration -from .protT5 import ProtT5DownstreamTasks, ProtT5TaskDataSet, protT5downtask_configuration \ No newline at end of file +from .prot_t5 import ProtT5, ProtT5TrainDataSet, prott5pretrain_configuration +from .prot_t5 import ProtT5DownstreamTasks, ProtT5TaskDataSet, prott5downtask_configuration diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/__init__.py similarity index 75% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/__init__.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/__init__.py index 06f0d432d..43112e748 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/__init__.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/__init__.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
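The data-conversion step described above ultimately reduces to: tokenize each CSV row, wrap the ids in a record that matches a fixed schema, and let `mindspore.mindrecord.FileWriter` accumulate the records into `.mindrecord` shards. Below is a stripped-down sketch of that write path; the schema, file-name pattern, and writer calls follow `trans_csv_to_mindrecord.py` shown earlier, while the toy letter-to-id tokenizer is a stand-in assumption so the snippet does not need the pretrained `T5Tokenizer`.

```python
import numpy as np
from mindspore.mindrecord import FileWriter

# Schema copied from trans_csv_to_mindrecord.py: one variable-length int32 field.
SCHEMA = {"raw_ids": {"type": "int32", "shape": [-1]}}

def toy_tokenize(seq):
    """Stand-in for T5Tokenizer, purely illustrative; the real script
    space-separates the sequence and calls the pretrained tokenizer."""
    vocab = {aa: i + 3 for i, aa in enumerate("ACDEFGHIKLMNPQRSTVWYX")}
    return [vocab.get(ch, vocab["X"]) for ch in seq] + [1]   # 1 stands in for </s>

def write_shard(sequences, out_file):
    writer = FileWriter(out_file, shard_num=1, overwrite=True)
    writer.add_schema(SCHEMA, "mindrecord_schema")
    records = [{"raw_ids": np.array(toy_tokenize(s), dtype=np.int32)}
               for s in sequences]
    writer.write_raw_data(records)
    writer.commit()

write_shard(["MKTAYIAK", "GSHMLEDP"], "data_0.mindrecord")
```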
# ============================================================================ -from .pretrain.pretrain_protT5 import ProtT5 +"""Prot T5""" +from .pretrain.pretrain_prott5 import ProtT5 from .pretrain.pretrain_dataloader import ProtT5TrainDataSet -from .pretrain.pretrain_configuration import protT5pretrain_configuration +from .pretrain.pretrain_configuration import prott5pretrain_configuration -from .downstream.protT5_downstream_tasks import ProtT5DownstreamTasks +from .downstream.prott5_downstream_tasks import ProtT5DownstreamTasks from .downstream.task_datasets import ProtT5TaskDataSet -from .downstream.downstream_configuration import protT5downtask_configuration \ No newline at end of file +from .downstream.downstream_configuration import prott5downtask_configuration diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/__init__.py similarity index 98% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/__init__.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/__init__.py index 175a0ece7..e8806c147 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/__init__.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""Module""" \ No newline at end of file +"""Module""" diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/deeploc_task.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py similarity index 93% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/deeploc_task.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py index 462c74cc4..fddc063bd 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/deeploc_task.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py @@ -49,7 +49,7 @@ class DeeplocTask(BaseTask): self.net.load_from_pretrained(self.checkpoint_path) if self.train_conf.train_data_path: - self.train_dataset = create_deeploc_dataset(self.train_conf.train_data_path, + self.train_dataset = create_deeploc_dataset(self.train_conf.train_data_path, \ self.t5_tokenizer, batch_size=self.train_conf.batch_size, cate_name=self.cate_name) self.loss_fn = nn.CrossEntropyLoss(reduction='mean') batch_num = self.train_dataset.get_dataset_size() @@ -57,11 +57,12 @@ class DeeplocTask(BaseTask): self.optimizer = nn.Adam(self.net.trainable_params(), learning_rate=learning_rate) if self.train_conf.eval_data_path: - self.eval_dataset = create_deeploc_dataset(self.train_conf.eval_data_path, + self.eval_dataset = create_deeploc_dataset(self.train_conf.eval_data_path, \ self.t5_tokenizer, batch_size=self.train_conf.batch_size, cate_name=self.cate_name) @staticmethod def __eval_fn(model_fn, dataset): + """eval give dataset with model; staticmethod""" metric = nn.Accuracy('classification') metric.clear() for inputs, masks, targets in dataset: @@ -73,22 +74,26 @@ class DeeplocTask(BaseTask): return accuracy def eval_fn(self, dataset): + """eval dataset""" return DeeplocTask.__eval_fn(self.net, dataset) def forward_fn(self, inputs, masks, targets): + """forward loss""" logits = self.net(inputs, masks) loss = self.loss_fn(logits, targets) return loss def eval_acc(self, 
eval_data_path): - eval_dataset = create_deeploc_dataset(eval_data_path, self.t5_tokenizer, + """eval accuracy data file""" + eval_dataset = create_deeploc_dataset(eval_data_path, self.t5_tokenizer, \ batch_size=self.train_conf.batch_size, cate_name=self.cate_name) return self.eval_fn(eval_dataset) def predict(self, data): + """predict""" logits = self.net(*data) softmax = ops.Softmax(axis=1) probabilities = softmax(logits) predicted_labels = ops.Argmax(axis=1)(probabilities).asnumpy() predicted_cates = map_label_to_category(predicted_labels, self.label_to_cate) - return predicted_cates \ No newline at end of file + return predicted_cates diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_configuration.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_configuration.py similarity index 94% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_configuration.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_configuration.py index 712eb66c9..7ffbe7e40 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_configuration.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_configuration.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"protT5 downstream task configure" -protT5downtask_configuration = { +"prott5 downstream task configure" +prott5downtask_configuration = { "protT5_base": "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml", "protT5downtask_predict": "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml", "protT5downtask_train": "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml" -} \ No newline at end of file +} diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_nets.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_nets.py similarity index 86% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_nets.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_nets.py index 26772ddd7..5d390561e 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_nets.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_nets.py @@ -26,7 +26,7 @@ EMBEDIING_LENGTH = 1024 class ConvNet(nn.Cell): - """acid token level predictor; using convolution net to convergence of local information.""" + """Acid token level predictor; using convolution net to convergence of local information.""" def __init__(self): super().__init__() # CNN weights are trained on ProtT5 embeddings @@ -37,12 +37,15 @@ class ConvNet(nn.Cell): ]) n_final_in = 32 - self.dssp3_classifier = nn.Conv2d(n_final_in, 3, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) - self.dssp8_classifier = nn.Conv2d(n_final_in, 8, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) - self.diso_classifier = nn.Conv2d(n_final_in, 2, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + self.dssp3_classifier = nn.Conv2d(n_final_in, 3, kernel_size=(7, 1), \ + pad_mode='pad', padding=(3, 3, 0, 
0), has_bias=True) + self.dssp8_classifier = nn.Conv2d(n_final_in, 8, kernel_size=(7, 1), \ + pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + self.diso_classifier = nn.Conv2d(n_final_in, 2, kernel_size=(7, 1), \ + pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) def construct(self, embeddings, masks): - # IN: X = (B x L x F); OUT: (B x F x L, 1) + """construct: IN: X = (B x L x F); OUT: (B x F x L, 1)""" x = embeddings * ops.expand_dims(masks, -1) x = ops.Transpose()(x, (0, 2, 1)).unsqueeze(-1) x = self.feature_extractor(x) # OUT: (B x 32 x L x 1) @@ -53,20 +56,19 @@ class ConvNet(nn.Cell): class MeanPoolingClassifier(nn.Cell): + """Acid sequence level predictor; using mean pooling classifier.""" def __init__(self, num_classes): super().__init__() self.num_classes = num_classes - self.dense = nn.Dense(EMBEDIING_LENGTH, 32, activation='relu') self.classifier = nn.Dense(32, num_classes) self.dropout = nn.Dropout(p=0.1) def construct(self, embeddings, masks): + """construct""" masks = ops.cast(masks, ms.float32) masked_inputs = embeddings * ops.expand_dims(masks, -1) - mean_pooled = ops.ReduceMean(keep_dims=False)(masked_inputs, 1) - mean_pooled = self.dropout(mean_pooled) compressed = self.dense(mean_pooled) output = self.classifier(compressed) @@ -87,13 +89,14 @@ class EmbeddingTaskNet(nn.Cell): param.requires_grad = False def construct(self, inputs, masks): + """construct""" masks = ops.cast(masks, ms.float32) embeddings = self.t5.encoder_forward(inputs, masks) output = self.downstream_net(embeddings, masks) return output def load_from_pretrained(self, config_path): - # load downstream task checkpoint + """load downstream task checkpoint""" non_pretrained_param_dict = ms.load_checkpoint(config_path) param_not_load, _ = ms.load_param_into_net(self.downstream_net, non_pretrained_param_dict) self.downstream_net.set_train(False) @@ -101,6 +104,7 @@ class EmbeddingTaskNet(nn.Cell): logger.warning("Not Loaded param list: %s", param_not_load) def save_checkpoint(self, model_path): + """save checkpoint""" non_pretrained_param_dict = {} for param in self.downstream_net.trainable_params(): non_pretrained_param_dict[param.name] = param.data diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_task.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_task.py similarity index 97% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_task.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_task.py index 7d6047f2d..70da08617 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/downstream_task.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_task.py @@ -71,14 +71,16 @@ class BaseTask: pass def train_step(self, *args): + """train step""" loss, grads = self.grad_fn(*args) self.optimizer(grads) return loss def train(self): + """train""" weights = self.net.trainable_params() self.grad_fn = value_and_grad(self.forward_fn, None, weights) - + logger.info("Begin training...") for epoch in range(self.epochs): logger.info("Epoch: %d", epoch) @@ -103,4 +105,4 @@ class BaseTask: if self.checkpoint_save_path: self.net.save_checkpoint(self.checkpoint_save_path) - logger.info("Checkpoint dumpped successful") \ No newline at end of file + logger.info("Checkpoint dumpped successful") diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/hhblits_task.py 
b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py similarity index 93% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/hhblits_task.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py index dd224f814..56160fe88 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/hhblits_task.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py @@ -73,24 +73,28 @@ class HHblitsTask(BaseTask): metric_q8.update(logits2, d8labels) dataset.reset() - _m3acc = metric_q3.get() - _m8acc = metric_q8.get() - return _m3acc, _m8acc + m3acc = metric_q3.get() + m8acc = metric_q8.get() + return m3acc, m8acc def eval_fn(self, dataset): + """eval dataset""" return HHblitsTask.__eval_fn(self.net, dataset) def eval_acc(self, eval_data_path): + """eval accuracy of data file""" eval_dataset = create_hhblits_dataset(eval_data_path, self.t5_tokenizer, self.train_conf.batch_size) - _m3acc, _m8acc = self.eval_fn(eval_dataset) - return _m3acc, _m8acc + m3acc, m8acc = self.eval_fn(eval_dataset) + return m3acc, m8acc def token_level_crossentoryloss(self, logits, labels, num_classes, loss_fn): + """token level crossentory loss""" activate_labels = labels.view(-1) activate_logits = logits.view(-1, num_classes) return loss_fn(activate_logits, activate_labels) def predict(self, data): + """predict""" logits1, logits2, _ = self.net(*data) softmax = ops.Softmax(axis=-1) probabilities1 = softmax(logits1) @@ -111,4 +115,3 @@ class HHblitsTask(BaseTask): loss2 = self.token_level_crossentoryloss( logits2, d8labels, 8, self.loss_fn) return 0.5 * loss1 + 0.5 * loss2 - \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/protT5_downstream_tasks.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py similarity index 95% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/protT5_downstream_tasks.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py index e0006478b..fbea0e9de 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/protT5_downstream_tasks.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
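The token-level loss used by the HHblits task above works by flattening the `(batch, length, classes)` logits and `(batch, length)` labels before handing them to cross-entropy, so that padded and special positions (filled with `-100` by the dataset code later in this patch) can be excluded. A small MindSpore sketch of that pattern follows; the shapes are toy values, and configuring `nn.CrossEntropyLoss` with `ignore_index=-100` is an assumption about how the filler label is meant to be skipped (it matches the framework default).

```python
import numpy as np
from mindspore import Tensor, nn

LABEL_MASKER = -100          # filler used for padded label positions
NUM_CLASSES = 3              # e.g. the 3-state secondary-structure labels (Q3)

def token_level_ce(logits, labels, loss_fn):
    """Flatten (B, L, C) logits and (B, L) labels before the loss,
    mirroring token_level_crossentoryloss in the task above."""
    flat_logits = logits.view(-1, NUM_CLASSES)
    flat_labels = labels.view(-1)
    return loss_fn(flat_logits, flat_labels)

# Toy batch: 2 sequences of length 4, last position of each padded with -100.
logits = Tensor(np.random.randn(2, 4, NUM_CLASSES).astype(np.float32))
labels = Tensor(np.array([[0, 2, 1, LABEL_MASKER],
                          [1, 1, 0, LABEL_MASKER]], dtype=np.int32))

loss_fn = nn.CrossEntropyLoss(reduction="mean", ignore_index=LABEL_MASKER)
print(token_level_ce(logits, labels, loss_fn))
```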
# ============================================================================ -"""ProtT5 Downstream Task class implment.""" +"""ProtT5 Downstream Task class implement.""" from mindspore import jit from ...model import Model @@ -54,6 +54,7 @@ class ProtT5DownstreamTasks(Model): return self.network.predict(data) def eval_acc(self, data_path): + """eval accuracy of data file""" if self.task_name == "hhblits": m3acc, m8acc = self.network.eval_acc(data_path) print("Accuracy Q3 %.4f; Q8 %.4f" % (m3acc, m8acc)) @@ -62,12 +63,13 @@ class ProtT5DownstreamTasks(Model): print("Accuracy %.4f" % acc) def train(self): + """train""" self.network.train() @jit def train_step(self, data): self.network.train_step(*data) - + @jit def _jit_forward(self, data): pass diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/task_datasets.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py similarity index 83% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/task_datasets.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py index 954cea99c..d3885045c 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/downstream/task_datasets.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py @@ -30,15 +30,16 @@ LABEL_MASKER = -100 # Category Definitions in Data MEMBRANE_CATES = {'M': 0, 'S': 1, 'U': 1} LOC_CATES = { - 'Cell.membrane': 0, 'Cytoplasm': 1, 'Endoplasmic.reticulum': 2, - 'Golgi.apparatus': 3, 'Lysosome/Vacuole': 4, 'Mitochondrion': 5, - 'Nucleus': 6, 'Peroxisome': 7, 'Plastid': 8, 'Extracellular': 9 - } + 'Cell.membrane': 0, 'Cytoplasm': 1, 'Endoplasmic.reticulum': 2, + 'Golgi.apparatus': 3, 'Lysosome/Vacuole': 4, 'Mitochondrion': 5, + 'Nucleus': 6, 'Peroxisome': 7, 'Plastid': 8, 'Extracellular': 9 + } HHBLITS_D3_CATES = {'C': 0, 'E': 1, 'H': 2} HHBLITS_D8_CATES = {'C': 0, 'E': 1, 'H': 2, 'B': 3, 'G': 4, 'I': 5, 'S': 6, 'T': 7} def reverse_dict(original_dict): + """reverse dict""" return {value: key for key, value in original_dict.items()} MEMBRANE_LABEL_TO_CATE = {0: 'M', 1: 'S'} @@ -48,22 +49,25 @@ HHBLITS_D8_LABEL_TO_CATE = reverse_dict(HHBLITS_D8_CATES) def map_label_to_category(labels, dct): + """map label to category""" vectorized_map = np.vectorize(lambda label: dct.get(label, '')) str_labels = vectorized_map(labels) return str_labels def seq2array(seq): + """sequence to numpy array""" return np.array(seq).astype(np.int32) def pad_trunc_addspecial(seq, max_length, sp=LABEL_MASKER, pad=0, add_special_tokens=True): + """pad trunc addspecial""" if len(seq) > max_length: if add_special_tokens: seq = seq[:max_length-1] else: seq = seq[:max_length] - + if add_special_tokens: seq.append(sp) @@ -72,6 +76,7 @@ def pad_trunc_addspecial(seq, max_length, sp=LABEL_MASKER, pad=0, add_special_to def get_task_info(cate_name): + """get task info""" if cate_name == 'loc': return LOC_CATES elif cate_name == 'membrane': @@ -81,6 +86,7 @@ def get_task_info(cate_name): def apply_tokenizer(text, tokenizer): + """apply tokenizer""" text = re.sub(r"[UZOB]", "X", text) tokens = tokenizer(text, padding='max_length', truncation=True, add_special_tokens=True, max_length=512) ids, masks = tokens['input_ids'], tokens['attention_mask'] @@ -88,7 +94,7 @@ def apply_tokenizer(text, tokenizer): def create_deeploc_dataset(file_path, tokenizer, batch_size=32, cate_name=''): - # read CSV file + """create deeploc dataset""" df = pd.read_csv(file_path) res = [] df = df.rename(columns=lambda x: 
x.strip()) @@ -109,9 +115,10 @@ def create_deeploc_dataset(file_path, tokenizer, batch_size=32, cate_name=''): def load_hhblits_dataset(path): + """load hhblits dataset""" df = pd.read_csv(path, names=['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'], skiprows=1) df = df.rename(columns=lambda x: x.strip()) - + input_fixed = ["".join(seq.split()) for seq in df['input']] input_fixed = [re.sub(r"[UZOB]", "X", seq) for seq in input_fixed] seqs = [" ".join(seq) for seq in input_fixed] @@ -129,22 +136,22 @@ def load_hhblits_dataset(path): def create_hhblits_dataset(file_path, tokenizer, batch_size=32): - # read CSV file + """create hhblits dataset""" seqs, d3_labels, d8_labels, disorder = load_hhblits_dataset(file_path) res = [] for seq, d3, d8, diso in zip(seqs, d3_labels, d8_labels, disorder): ids, masks = apply_tokenizer(seq, tokenizer) - - _d3 = [HHBLITS_D3_CATES[x.strip()] for x in d3] - _d3 = pad_trunc_addspecial(_d3, 512, pad=LABEL_MASKER) - _d8 = [HHBLITS_D8_CATES[x.strip()] for x in d8] - _d8 = pad_trunc_addspecial(_d8, 512, pad=LABEL_MASKER) + d3 = [HHBLITS_D3_CATES[x.strip()] for x in d3] + d3 = pad_trunc_addspecial(d3, 512, pad=LABEL_MASKER) + + d8 = [HHBLITS_D8_CATES[x.strip()] for x in d8] + d8 = pad_trunc_addspecial(d8, 512, pad=LABEL_MASKER) - _diso = [int(float(x.strip())) for x in diso] - _diso = pad_trunc_addspecial(_diso, 512, pad=LABEL_MASKER) - - eles = [ids, masks, _d3, _d8, _diso] + diso = [int(float(x.strip())) for x in diso] + diso = pad_trunc_addspecial(diso, 512, pad=LABEL_MASKER) + + eles = [ids, masks, d3, d8, diso] eles_tp = [seq2array(x) for x in eles] res.append(tuple(eles_tp)) @@ -154,7 +161,7 @@ def create_hhblits_dataset(file_path, tokenizer, batch_size=32): dataset = dataset.batch(batch_size=batch_size) return dataset - + class ProtT5TaskDataSet(DataSet): """ProtT5 downstream task dataSet""" def __init__(self, config): @@ -165,6 +172,8 @@ class ProtT5TaskDataSet(DataSet): self.t5_tokenizer = T5Tokenizer.from_pretrained(config.t5_config_path) self.phase = None + super().__init__() + # pylint: disable=E0302 def __getitem__(self): pass @@ -180,7 +189,7 @@ class ProtT5TaskDataSet(DataSet): def process(self, data, **kwargs): return seqs_tokenizer(data, self.t5_tokenizer, return_tensors="ms") - def set_training_data_src(self, data_source, **kwargs): + def set_training_data_src(self, data_source): self.data_path = data_source def download(self, path=None): @@ -189,10 +198,11 @@ class ProtT5TaskDataSet(DataSet): def data_parse(self, idx): pass - def create_iterator(self, num_epochs, cate_name = ''): + def create_iterator(self, num_epochs, cate_name=''): if self.task_name == "hhblits": self.dataset = create_hhblits_dataset(self.data_path, self.t5_tokenizer, self.batch_size) else: - self.dataset = create_deeploc_dataset(self.data_path, self.t5_tokenizer, self.batch_size, cate_name=cate_name) + self.dataset = create_deeploc_dataset(self.data_path, self.t5_tokenizer, self.batch_size, \ + cate_name=cate_name) return self.dataset diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/__init__.py similarity index 98% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/__init__.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/__init__.py index 175a0ece7..e8806c147 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/__init__.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/__init__.py @@ -12,4 +12,4 @@ # 
See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""Module""" \ No newline at end of file +"""Module""" diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/optimization.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/optimization.py similarity index 97% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/optimization.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/optimization.py index 68e8c2a66..0392e8012 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/optimization.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/optimization.py @@ -29,8 +29,9 @@ class WarmUpPolynomialDecayLR(LearningRateSchedule): self.end_learning_rate = end_learning_rate self.decay_steps = decay_steps self.power = power - + def construct(self, global_step): + """construct""" # warmup lr warmup_percent = global_step.astype(mindspore.float32) / self.warmup_steps warmup_learning_rate = self.learning_rate * warmup_percent @@ -45,6 +46,7 @@ class WarmUpPolynomialDecayLR(LearningRateSchedule): def create_optimizer(model, init_lr, optim_type, weight_decay=0.0): + """create optimizer""" if optim_type == 'adafactor': optim = AdaFactor(model.trainable_params()) elif weight_decay > 0: @@ -52,4 +54,4 @@ def create_optimizer(model, init_lr, optim_type, weight_decay=0.0): else: optim = nn.Adam(model.trainable_params(), learning_rate=init_lr) - return optim \ No newline at end of file + return optim diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_configuration.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_configuration.py similarity index 97% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_configuration.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_configuration.py index 1900dc847..f43d4e461 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_configuration.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_configuration.py @@ -13,11 +13,11 @@ # limitations under the License. 
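The schedule defined in `optimization.py` above ramps the learning rate linearly over `warmup_steps` and then decays it toward `end_learning_rate` over `decay_steps`; with `power=1.0` and `end_learning_rate=0.0`, the values the pretrain trainer passes in, that decay is simply linear. The framework-free sketch below is useful for eyeballing the curve before a long run; only the warmup branch is fully visible in the diff, so the polynomial-decay formula here is the standard form and should be treated as an assumption.

```python
def warmup_poly_decay_lr(step, base_lr, end_lr, warmup_steps, decay_steps, power=1.0):
    """Linear warmup followed by polynomial decay (assumed standard form)."""
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    frac = min(step, decay_steps) / decay_steps
    return (base_lr - end_lr) * (1.0 - frac) ** power + end_lr

# The pretrain trainer uses end_lr=0.0 and power=1.0, i.e. plain linear decay.
for s in (0, 500, 1000, 5000, 10000):
    print(s, warmup_poly_decay_lr(s, base_lr=1e-4, end_lr=0.0,
                                  warmup_steps=1000, decay_steps=10000))
```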
# ============================================================================ "protT5 pretrain task configure" -protT5pretrain_configuration = { +prott5pretrain_configuration = { "protT5_base": "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml", "protT5downtask_predict": "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml", "protT5downtask_train": "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml" -} \ No newline at end of file +} diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_dataloader.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_dataloader.py similarity index 96% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_dataloader.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_dataloader.py index 9d3f1cc9b..758d24eae 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_dataloader.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_dataloader.py @@ -46,12 +46,14 @@ class EncDecIds(): def find_mindrecord_files(directory): + """find mindrecord files""" files = os.listdir(directory) mindrecord_files = [os.path.join(directory, f) for f in files if f.endswith('.mindrecord')] return mindrecord_files def create_pretrain_dataset(mr_files, batch_size, epochs, rank_size=0, rank_id=0): + """create pretrain dataset""" if rank_size > 0: dataset = ds.MindDataset(dataset_files=mr_files, columns_list=["raw_ids"], num_shards=rank_size, shard_id=rank_id, shuffle=True) @@ -78,6 +80,8 @@ class ProtT5TrainDataSet(DataSet): self.t5_config_path = config.t5_config_path self.tokenizer = None + super().__init__() + # pylint: disable=E0302 def __getitem__(self): pass @@ -99,9 +103,9 @@ class ProtT5TrainDataSet(DataSet): self.tokenizer = T5Tokenizer.from_pretrained(self.t5_config_path) return seqs_tokenizer(data, self.tokenizer, return_tensors=re_type) - def set_training_data_src(self, data_source, **kwargs): + def set_training_data_src(self, data_source): self.data_path = data_source - + def download(self, path=None): pass @@ -113,4 +117,3 @@ class ProtT5TrainDataSet(DataSet): self.dataset = create_pretrain_dataset(mr_files, self.batch_size, num_epochs, rank_size=rank_size, rank_id=rank_id) data_loader = self.dataset.create_tuple_iterator() return data_loader - diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_protT5.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py similarity index 92% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_protT5.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py index 23c51322b..331b8f6f4 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/pretrain_protT5.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py @@ -65,6 +65,7 @@ class ProtT5(Model): def init_context(self): + """init context""" if self.use_parallel: init() self.rank_id = get_rank() @@ -75,16 +76,17 @@ class ProtT5(Model): else: ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") - + def init_trainer(self): + """init trainer""" if self.train_conf.save_ckpt_path: os.makedirs(self.train_conf.save_ckpt_path, exist_ok=True) # data_loader dataset_path = 
find_mindrecord_files(self.train_conf.train_data_path) - train_dataset = create_pretrain_dataset(dataset_path, self.train_conf.batch_size, self.train_conf.epochs, - rank_size=self.rank_size, rank_id=self.rank_id) + train_dataset = create_pretrain_dataset(dataset_path, self.train_conf.batch_size, self.train_conf.epochs, \ + rank_size=self.rank_size, rank_id=self.rank_id) self.train_dataloader = train_dataset.create_tuple_iterator() num_train_steps = train_dataset.get_dataset_size() @@ -114,16 +116,18 @@ class ProtT5(Model): self.grad_fn = value_and_grad(self.forward_fn, None, weights, has_aux=False) def forward_fn(self, input_ids, input_mask, decode_ids): + """forward loss""" loss = self.network(input_ids, input_mask, decode_ids) return loss def save_checkpoint(self, train_step_nums): + """save checkpoint""" if self.rank_id == 0: filename = generate_checkpoint_filename(self.train_conf.save_ckpt_path, train_step_nums) ms.save_checkpoint(self.network, filename) def train(self): - # train begin + """train""" loss_total = 0 cur_step_nums, train_step_nums, skip_step_nums = 0, 0, 0 cur_time, avg_time = time.time(), 0 @@ -139,7 +143,7 @@ class ProtT5(Model): else: logger.warning(f"grads overflow, skip step {cur_step_nums}; loss: {loss}") skip_step_nums += 1 - + if train_step_nums % PRINT_ITERS == 0 and train_step_nums != 0: print_time = time.time() total_time = print_time - cur_time @@ -147,10 +151,10 @@ class ProtT5(Model): avg_time = total_time / (PRINT_ITERS + skip_step_nums) logger.info(f"avg_time(ms): {avg_time * 1000:2f}, " - f"cur_step: {cur_step_nums}, " - f"skip_steps: {skip_step_nums:3d}, " - f"train_step: {train_step_nums}, " - f"loss: {loss_total/PRINT_ITERS:f}, ") + f"cur_step: {cur_step_nums}, " + f"skip_steps: {skip_step_nums:3d}, " + f"train_step: {train_step_nums}, " + f"loss: {loss_total/PRINT_ITERS:f}, ") loss_total = 0 skip_step_nums = 0 @@ -158,17 +162,18 @@ class ProtT5(Model): # saving ckpt per N steps or last step if train_step_nums % self.train_conf.save_steps == 0: self.save_checkpoint(train_step_nums) - + cur_step_nums += 1 self.save_checkpoint(train_step_nums) logger.info("Pretrain done!") - + @ms.jit def _train_step(self, input_ids, input_mask, decode_ids): + """train step jit function""" loss, grads = self.grad_fn(input_ids, input_mask, decode_ids) - + if self.use_parallel: grads = self.grad_reducer(grads) @@ -184,9 +189,11 @@ class ProtT5(Model): return loss, is_finite def train_step(self, data): + """train step""" return self._train_step(*data) def predict(self, data, mode="embedding"): + """predict""" self.network.set_train(False) token_ids, attention_mask = data if mode == "generate": @@ -203,7 +210,7 @@ class ProtT5(Model): def forward(self, data): pass - + def backward(self, data): pass @@ -211,4 +218,4 @@ class ProtT5(Model): pass def _pynative_forward(self, data): - pass \ No newline at end of file + pass diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/t5_modeling.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/t5_modeling.py similarity index 96% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/t5_modeling.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/t5_modeling.py index 214b45302..379f5f27d 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/pretrain/t5_modeling.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/t5_modeling.py @@ -26,10 +26,12 @@ logger = logging.getLogger(__name__) def set_data(weight, init_distribution): + """set data weight""" 
weight.set_data(initializer(init_distribution, weight.shape, weight.dtype)) def init_cell(cell, name, config): + """init cell""" factor = config.initializer_factor if "layernorm" in name: set_data(cell.gamma, Constant(factor * 1.0)) @@ -57,18 +59,20 @@ def init_cell(cell, name, config): def init_t5_weights(cell, config, prefix=''): + """init t5 weights""" if hasattr(cell, 'add_name'): return cell.add_flags(add_name=prefix) init_cell(cell, prefix, config) - + for name, sub_cell in cell.cells_and_names(): hier_name = prefix + "." + name init_t5_weights(sub_cell, config, prefix=hier_name) def trans_to_transformer_config(parallel_config): + """trans_to_transformer_config""" if not parallel_config: return default_transformer_config @@ -76,13 +80,14 @@ def trans_to_transformer_config(parallel_config): def create_model(config_path, load_model_path=None, parallel_config=None, from_pretrained=False): + """create model""" if from_pretrained: return T5WithLoss.from_pretrained(config_path) base_config = AutoConfig.from_pretrained(config_path) base_config.parallel_config = trans_to_transformer_config(parallel_config) model = T5WithLoss(base_config) - + if load_model_path: # load from checkpoint path param_dict = mindspore.load_checkpoint(load_model_path) @@ -94,4 +99,3 @@ def create_model(config_path, load_model_path=None, parallel_config=None, from_p logger.info("pretrain: inited successful") return model - diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/convert_weight.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/convert_weight.py similarity index 100% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/convert_weight.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/convert_weight.py diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/trans_csv_to_mindrecord.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py similarity index 96% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/trans_csv_to_mindrecord.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py index b92711458..b1ce79501 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/scripts/trans_csv_to_mindrecord.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py @@ -33,6 +33,7 @@ file_index = 0 def process_df(row, tokenizer): + """process df""" text = re.sub(r"[UZOB]", "X", row['text']) tokens = tokenizer(" ".join(text), truncation=True, add_special_tokens=True, max_length=512) token_ids = tokens['input_ids'] @@ -44,6 +45,7 @@ def process_df(row, tokenizer): def get_writer(output_dir): + "writer" global file_index file_name = os.path.join(output_dir, f"data_{file_index}.mindrecord") writer = FileWriter(file_name, shard_num=1, overwrite=True) @@ -90,6 +92,7 @@ def converse_file(csv_file_path, num_samples, output_dir, tokenizer): def run(file_path, num_samples, output_dir): + """run""" tokenizer = T5Tokenizer.from_pretrained(args.t5_config_path) if os.path.isfile(file_path) and file_path.endswith('csv'): converse_file(file_path, num_samples, output_dir, tokenizer) @@ -102,7 +105,7 @@ def run(file_path, num_samples, output_dir): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--number_samples", default=-1, type=int, - help="Choose maxium process data sample number.") + help="Choose maximum process data sample number.") parser.add_argument("--data_dir", type=str, 
required=True, help="Data path to converse to mindrecords; it can file or dir.") parser.add_argument("--output_dir", type=str, required=True, help="Data path of output.") diff --git a/MindSPONGE/src/mindsponge/pipeline/models/protT5/utils/utils.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py similarity index 93% rename from MindSPONGE/src/mindsponge/pipeline/models/protT5/utils/utils.py rename to MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py index b1a90f23b..12fd6fb0b 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/protT5/utils/utils.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py @@ -28,7 +28,7 @@ def generate_checkpoint_filename(checkpoint_dir, model_info): def seqs_tokenizer(sequences, tokenizer, return_tensors=None): - # data preprocess; UZOB is rare which are replaced in ProtT5 model + """tokenizer; data preprocess; UZOB is rare which are replaced in ProtT5 model""" sequences = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences] tokens = tokenizer(sequences, padding=True, add_special_tokens=True, return_tensors=return_tensors) return (tokens['input_ids'], tokens["attention_mask"]) \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/pipeline.py b/MindSPONGE/src/mindsponge/pipeline/pipeline.py index 65c7901f7..60a41859b 100644 --- a/MindSPONGE/src/mindsponge/pipeline/pipeline.py +++ b/MindSPONGE/src/mindsponge/pipeline/pipeline.py @@ -35,8 +35,8 @@ from .models import RASP, RASPDataSet, rasp_configuration from .models import Multimer, MultimerDataSet, multimer_configuration from .models import ProteinMpnn, ProteinMpnnDataset, proteinmpnn_configuration from .models import UFold, UFoldDataSet, ufold_configuration -from .models import ProtT5, ProtT5TrainDataSet, protT5pretrain_configuration -from .models import ProtT5DownstreamTasks, ProtT5TaskDataSet, protT5downtask_configuration +from .models import ProtT5, ProtT5TrainDataSet, prott5pretrain_configuration +from .models import ProtT5DownstreamTasks, ProtT5TaskDataSet, prott5downtask_configuration model_card = { @@ -55,8 +55,9 @@ model_card = { "Proteinmpnn": {"model": ProteinMpnn, "dataset": ProteinMpnnDataset, "config": proteinmpnn_configuration}, "RASP": {"model": RASP, "dataset": RASPDataSet, "config": rasp_configuration}, "UFold": {"model": UFold, "dataset": UFoldDataSet, "config": ufold_configuration}, - "ProtT5": {"model": ProtT5, "dataset": ProtT5TrainDataSet, "config": protT5pretrain_configuration}, - "ProtT5Downstream": {"model": ProtT5DownstreamTasks, "dataset": ProtT5TaskDataSet, "config": protT5downtask_configuration} + "ProtT5": {"model": ProtT5, "dataset": ProtT5TrainDataSet, "config": prott5pretrain_configuration}, + "ProtT5Downstream": {"model": ProtT5DownstreamTasks, "dataset": ProtT5TaskDataSet, + "config": prott5downtask_configuration} } -- Gitee From 6cbc33e9d346fe623604ebe36c90fb0a2cadeecb Mon Sep 17 00:00:00 2001 From: "L.L" <920817420@qq.com> Date: Wed, 23 Oct 2024 16:57:31 +0800 Subject: [PATCH 3/6] fix2 --- MindSPONGE/applications/model_cards/ProtT5.md | 10 ++++---- .../models/prot_t5/downstream/deeploc_task.py | 1 + .../models/prot_t5/downstream/hhblits_task.py | 10 +++++--- .../downstream/prott5_downstream_tasks.py | 3 ++- .../prot_t5/downstream/task_datasets.py | 24 +++++++------------ .../prot_t5/pretrain/pretrain_dataloader.py | 9 ++++--- .../prot_t5/pretrain/pretrain_prott5.py | 13 ++++++---- .../scripts/trans_csv_to_mindrecord.py | 10 ++++---- .../pipeline/models/prot_t5/utils/utils.py | 4 ++-- 9 files 
changed, 46 insertions(+), 38 deletions(-) diff --git a/MindSPONGE/applications/model_cards/ProtT5.md b/MindSPONGE/applications/model_cards/ProtT5.md index 34dbf9204..8b19e10fb 100644 --- a/MindSPONGE/applications/model_cards/ProtT5.md +++ b/MindSPONGE/applications/model_cards/ProtT5.md @@ -35,9 +35,11 @@ python scripts/convert_weight.py --layers 24 --torch_path pytorch_model.bin --mi ### Dependencies -* mindspore >= 2.3.0 -* mindformers >= 1.2.0 -* sentencepiece >= 0.2.0 +``` +mindspore >= 2.3.0 +mindformers >= 1.2.0 +sentencepiece >= 0.2.0 +``` ### ProtT5预测 @@ -101,7 +103,7 @@ print("Output:", res) # 评估测试集; 项目主页有数据集下载地址 eval_data_path = "./dataset/deeploc_test_set.csv" pipe.model.eval_acc(eval_data_path) -# Accuracy 0.8129 +# Accuracy 0.8129 # train # config文件中设置好train_data_path和eval_data_path等参数 diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py index fddc063bd..0b17a7e3e 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py @@ -77,6 +77,7 @@ class DeeplocTask(BaseTask): """eval dataset""" return DeeplocTask.__eval_fn(self.net, dataset) + # pylint: disable=W0221 def forward_fn(self, inputs, masks, targets): """forward loss""" logits = self.net(inputs, masks) diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py index 56160fe88..c7c28fbc5 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ +"""Pretrain property prediction task; hhblits dataset.""" from mindspore import nn import mindspore.ops as ops @@ -51,7 +52,8 @@ class HHblitsTask(BaseTask): self.net.load_from_pretrained(self.checkpoint_path) if self.train_conf.train_data_path: - self.train_dataset = create_hhblits_dataset(self.train_conf.train_data_path, self.t5_tokenizer, self.train_conf.batch_size) + self.train_dataset = create_hhblits_dataset(self.train_conf.train_data_path, + self.t5_tokenizer, self.train_conf.batch_size) batch_num = self.train_dataset.get_dataset_size() learning_rate = lr_secheduler(self.train_conf.lr, batch_num, self.train_conf.epochs) self.optimizer = nn.Adam(self.net.trainable_params(), learning_rate=learning_rate) @@ -59,7 +61,8 @@ class HHblitsTask(BaseTask): # eval if self.train_conf.eval_data_path: - self.eval_dataset = create_hhblits_dataset(self.train_conf.eval_data_path, self.t5_tokenizer, self.train_conf.batch_size) + self.eval_dataset = create_hhblits_dataset(self.train_conf.eval_data_path, + self.t5_tokenizer, self.train_conf.batch_size) @staticmethod def __eval_fn(model_fn, dataset): @@ -107,7 +110,8 @@ class HHblitsTask(BaseTask): predicted_cates2 = map_label_to_category(predicted_labels2, HHBLITS_D8_LABEL_TO_CATE) return predicted_cates1, predicted_cates2 - def forward_fn(self, inputs, masks, d3labels, d8labels, disolabels): + # pylint: disable=W0221 + def forward_fn(self, inputs, masks, d3labels, d8labels): """multitask loss""" logits1, logits2, _ = self.net(inputs, masks) loss1 = self.token_level_crossentoryloss( diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py index fbea0e9de..9df864401 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py @@ -47,9 +47,10 @@ class ProtT5DownstreamTasks(Model): def forward(self, data): pass - def backward(self, feat): + def backward(self, data): pass + # pylint: disable=W0221 def predict(self, data): return self.network.predict(data) diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py index d3885045c..19f1db143 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py @@ -79,10 +79,9 @@ def get_task_info(cate_name): """get task info""" if cate_name == 'loc': return LOC_CATES - elif cate_name == 'membrane': + if cate_name == 'membrane': return MEMBRANE_CATES - else: - return {} + return {} def apply_tokenizer(text, tokenizer): @@ -129,17 +128,14 @@ def load_hhblits_dataset(path): label_fixed8 = ["".join(label.split()) for label in df['dssp8']] d8_labels = [list(label) for label in label_fixed8] - disorder_fixed = [" ".join(disorder.split()) for disorder in df['disorder']] - disorder = [disorder.split() for disorder in disorder_fixed] - - return seqs, d3_labels, d8_labels, disorder + return seqs, d3_labels, d8_labels def create_hhblits_dataset(file_path, tokenizer, batch_size=32): """create hhblits dataset""" - seqs, d3_labels, d8_labels, disorder = load_hhblits_dataset(file_path) + seqs, d3_labels, d8_labels = load_hhblits_dataset(file_path) res = [] - for seq, 
d3, d8, diso in zip(seqs, d3_labels, d8_labels, disorder): + for seq, d3, d8 in zip(seqs, d3_labels, d8_labels): ids, masks = apply_tokenizer(seq, tokenizer) d3 = [HHBLITS_D3_CATES[x.strip()] for x in d3] @@ -148,15 +144,12 @@ def create_hhblits_dataset(file_path, tokenizer, batch_size=32): d8 = [HHBLITS_D8_CATES[x.strip()] for x in d8] d8 = pad_trunc_addspecial(d8, 512, pad=LABEL_MASKER) - diso = [int(float(x.strip())) for x in diso] - diso = pad_trunc_addspecial(diso, 512, pad=LABEL_MASKER) - - eles = [ids, masks, d3, d8, diso] + eles = [ids, masks, d3, d8] eles_tp = [seq2array(x) for x in eles] res.append(tuple(eles_tp)) random.shuffle(res) - dataset = ds.GeneratorDataset(res, column_names=["inputs", "masks", "d3labels", "d8labels", "disolabels"]) + dataset = ds.GeneratorDataset(res, column_names=["inputs", "masks", "d3labels", "d8labels"]) dataset = dataset.shuffle(buffer_size=128) dataset = dataset.batch(batch_size=batch_size) return dataset @@ -198,11 +191,12 @@ class ProtT5TaskDataSet(DataSet): def data_parse(self, idx): pass + # pylint: disable=W0221 def create_iterator(self, num_epochs, cate_name=''): if self.task_name == "hhblits": self.dataset = create_hhblits_dataset(self.data_path, self.t5_tokenizer, self.batch_size) else: self.dataset = create_deeploc_dataset(self.data_path, self.t5_tokenizer, self.batch_size, \ - cate_name=cate_name) + cate_name=cate_name) return self.dataset diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_dataloader.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_dataloader.py index 758d24eae..a60b015b7 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_dataloader.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_dataloader.py @@ -56,11 +56,11 @@ def create_pretrain_dataset(mr_files, batch_size, epochs, rank_size=0, rank_id=0 """create pretrain dataset""" if rank_size > 0: dataset = ds.MindDataset(dataset_files=mr_files, columns_list=["raw_ids"], - num_shards=rank_size, shard_id=rank_id, shuffle=True) + num_shards=rank_size, shard_id=rank_id, shuffle=True) else: dataset = ds.MindDataset(dataset_files=mr_files, columns_list=["raw_ids"], shuffle=True) dataset = dataset.map(operations=EncDecIds(0.15), input_columns=["raw_ids"], - output_columns=["input_ids", "masks", "decode_ids"]) + output_columns=["input_ids", "masks", "decode_ids"]) # default: pad = 0 padding_shape = ([512], 0) @@ -94,6 +94,7 @@ class ProtT5TrainDataSet(DataSet): def set_phase(self, phase): self.phase = phase + # pylint: disable=W0221 def process(self, data, mode="embedding"): re_type = "ms" if mode == "generate": @@ -112,8 +113,10 @@ class ProtT5TrainDataSet(DataSet): def data_parse(self, idx): pass + # pylint: disable=W0221 def create_iterator(self, num_epochs, rank_size=0, rank_id=0): mr_files = find_mindrecord_files(self.data_path) - self.dataset = create_pretrain_dataset(mr_files, self.batch_size, num_epochs, rank_size=rank_size, rank_id=rank_id) + self.dataset = create_pretrain_dataset(mr_files, self.batch_size, num_epochs, + rank_size=rank_size, rank_id=rank_id) data_loader = self.dataset.create_tuple_iterator() return data_loader diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py index 331b8f6f4..32dcfd550 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py +++ 
b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py @@ -72,9 +72,9 @@ class ProtT5(Model): self.rank_size = get_group_size() ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend", device_id=self.rank_id) ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, parameter_broadcast=True, - device_num=self.rank_size, gradients_mean=True) + device_num=self.rank_size, gradients_mean=True) - else: + else: ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") @@ -192,21 +192,24 @@ class ProtT5(Model): """train step""" return self._train_step(*data) + # pylint: disable=W0221 def predict(self, data, mode="embedding"): """predict""" self.network.set_train(False) token_ids, attention_mask = data if mode == "generate": - """Generate the sequence of input texts.""" + # Generate the sequence of input texts output_ids = self.network.generate(token_ids, do_sample=False) output_tokens = self.tokenizer.decode(output_ids, skip_special_tokens=True) return output_tokens - elif mode == "embedding": - """Embedding of the final layer of encoder""" + + if mode == "embedding": + # Embedding of the final layer of encoder outputs = self.network.encoder_forward(token_ids, attention_mask) hiddens = outputs.asnumpy() return hiddens + return None def forward(self, data): pass diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py index b1ce79501..93e783f90 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py @@ -64,7 +64,7 @@ def converse_file(csv_file_path, num_samples, output_dir, tokenizer): reader = csv.DictReader(csvfile) for row in reader: index += 1 - if num_samples > 0 and index > num_samples: + if 0 < num_samples < index: break if current_file_size > MAX_TOKENS_PER_FILE: @@ -73,7 +73,7 @@ def converse_file(csv_file_path, num_samples, output_dir, tokenizer): current_file_size = 0 sample, token_length = process_df(row, tokenizer) - + # compute current file size current_file_size += 4 * token_length data.append(sample) @@ -97,7 +97,8 @@ def run(file_path, num_samples, output_dir): if os.path.isfile(file_path) and file_path.endswith('csv'): converse_file(file_path, num_samples, output_dir, tokenizer) else: - csv_files = [os.path.join(file_path, filename) for filename in os.listdir(file_path) if filename.endswith('.csv')] + csv_files = [os.path.join(file_path, filename) for filename in os.listdir(file_path) \ + if filename.endswith('.csv')] for cfile in csv_files: converse_file(cfile, num_samples, output_dir, tokenizer) @@ -113,5 +114,4 @@ if __name__ == "__main__": args = parser.parse_args() os.makedirs(args.output_dir, exist_ok=True) - - run(args.data_dir, args.number_samples, args.output_dir) \ No newline at end of file + run(args.data_dir, args.number_samples, args.output_dir) diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py index 12fd6fb0b..0fb525d73 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ +"""some util functions""" import os import datetime import re @@ -23,7 +24,6 @@ def generate_checkpoint_filename(checkpoint_dir, model_info): timestamp = now.strftime('%Y%m%d_%H%M%S') filename = f'model_{model_info}_{timestamp}.ckpt' filepath = os.path.join(checkpoint_dir, filename) - return filepath @@ -31,4 +31,4 @@ def seqs_tokenizer(sequences, tokenizer, return_tensors=None): """tokenizer; data preprocess; UZOB is rare which are replaced in ProtT5 model""" sequences = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences] tokens = tokenizer(sequences, padding=True, add_special_tokens=True, return_tensors=return_tensors) - return (tokens['input_ids'], tokens["attention_mask"]) \ No newline at end of file + return (tokens['input_ids'], tokens["attention_mask"]) -- Gitee From 39b78e8d188f962c2a4c716d1f1fd2795cf99adf Mon Sep 17 00:00:00 2001 From: "L.L" <920817420@qq.com> Date: Thu, 24 Oct 2024 10:42:28 +0800 Subject: [PATCH 4/6] fix3 --- MindSPONGE/applications/model_cards/ProtT5.md | 5 +++-- .../models/prot_t5/downstream/prott5_downstream_tasks.py | 4 ++-- .../pipeline/models/prot_t5/pretrain/pretrain_prott5.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/MindSPONGE/applications/model_cards/ProtT5.md b/MindSPONGE/applications/model_cards/ProtT5.md index 8b19e10fb..297d5276f 100644 --- a/MindSPONGE/applications/model_cards/ProtT5.md +++ b/MindSPONGE/applications/model_cards/ProtT5.md @@ -11,7 +11,8 @@ ProtTrans论文中有两类下游任务,预测蛋白质相关性质和氨基 ### 模型权重获取 -可以从如下链接中下载torch版本的checkpoint;[模型页面](https://huggingface.co/Rostlab/prot_t5_xl_uniref50)。 然后使用`protT5/scripts`文件夹中的`convert_weight.py`脚本转换为mindspore支持的格式, 脚本使用方式如下: +模型权重可以从mindspore默认的[checkpoint](https://download-mindspore.osinfra.cn/mindscience/mindsponge/ProtT5/checkpoint/)仓下载,也可以下载官方的torch权重文件转换。 +torch版本的权重文件下载链接:[模型页面](https://huggingface.co/Rostlab/prot_t5_xl_uniref50),然后使用`prot_t5/scripts`文件夹中的`convert_weight.py`脚本转换为mindspore支持的格式, 脚本使用方式如下: ```shell python scripts/convert_weight.py --layers 24 --torch_path pytorch_model.bin --mindspore_path ./mindspore_t5.ckpt @@ -35,7 +36,7 @@ python scripts/convert_weight.py --layers 24 --torch_path pytorch_model.bin --mi ### Dependencies -``` +```bash mindspore >= 2.3.0 mindformers >= 1.2.0 sentencepiece >= 0.2.0 diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py index 9df864401..1e15413b7 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py @@ -29,8 +29,8 @@ class ProtT5DownstreamTasks(Model): self.mixed_precision = False self.config = config - self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/protT5_xl.ckpt" - self.checkpoint_path = "./protT5_xl.ckpt" + self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/prot_t5_xl.ckpt" + self.checkpoint_path = "./prot_t5_xl.ckpt" self.mode = config.mode self.task_name = config.task_name diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py index 32dcfd550..24ec02843 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py +++ 
b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py @@ -48,8 +48,8 @@ class ProtT5(Model): self.rank_size = 1 self.init_context() - self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/protT5_xl.ckpt" - self.checkpoint_path = "./protT5_xl.ckpt" + self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/prot_t5_xl.ckpt" + self.checkpoint_path = "./prot_t5_xl.ckpt" self.mode = config.mode self.train_conf = config.train -- Gitee From 9a83c17e03de326ce35551a0b758414ae4cd5008 Mon Sep 17 00:00:00 2001 From: "L.L" <920817420@qq.com> Date: Thu, 24 Oct 2024 11:33:00 +0800 Subject: [PATCH 5/6] add requirements --- MindSPONGE/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MindSPONGE/requirements.txt b/MindSPONGE/requirements.txt index ae2e689c0..9b940d1b5 100644 --- a/MindSPONGE/requirements.txt +++ b/MindSPONGE/requirements.txt @@ -11,4 +11,6 @@ descriptastorus == 2.6.1 pyparsing >= 3.0.7 rdkit bio -scikit-learn \ No newline at end of file +scikit-learn +mindformers >= 1.2.0 +sentencepiece >= 0.2.0 \ No newline at end of file -- Gitee From dff80f29a10af22d4ab843c4457c76fc8135c7b3 Mon Sep 17 00:00:00 2001 From: "L.L" <920817420@qq.com> Date: Thu, 24 Oct 2024 14:01:12 +0800 Subject: [PATCH 6/6] add init --- .../pipeline/models/prot_t5/utils/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/__init__.py diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/__init__.py new file mode 100644 index 000000000..e8806c147 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Module""" -- Gitee
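Note on the pretraining data pipeline in this series: `create_pretrain_dataset` maps the stored `raw_ids` column into `input_ids`, `masks` and `decode_ids` through `EncDecIds(0.15)` before padding to length 512. The implementation of `EncDecIds` is not included in these patches, so the exact corruption scheme is an assumption; the sketch below only illustrates the general T5-style denoising idea (mask a fraction of encoder tokens and ask the decoder to reproduce them behind sentinel ids). The name `corrupt_for_t5`, the vocabulary size, and the sentinel/eos ids are illustrative, not taken from the code.

```python
# Illustration only: EncDecIds is defined elsewhere in the repository and is not
# shown in this patch series. This assumes a simplified T5-style scheme with
# single-token masking (consecutive masked tokens are not merged into spans).
import random

def corrupt_for_t5(raw_ids, mask_rate=0.15, vocab_size=128, eos_id=1):
    """Replace ~mask_rate of the tokens with sentinel ids in the encoder input
    and emit (sentinel, original token) pairs as the decoder target."""
    input_ids, decode_ids = [], []
    sentinel = vocab_size - 1          # T5 convention: sentinel ids sit at the top of the vocab
    for tok in raw_ids:
        if random.random() < mask_rate:
            input_ids.append(sentinel)          # encoder sees the sentinel
            decode_ids.extend([sentinel, tok])  # decoder must recover the original token
            sentinel -= 1
        else:
            input_ids.append(tok)
    decode_ids.append(eos_id)
    masks = [1] * len(input_ids)       # attention mask before padding
    return input_ids, masks, decode_ids

# Example: token ids standing in for a short protein sequence
print(corrupt_for_t5([5, 9, 13, 7, 22, 4, 16, 8]))
```

In the actual pipeline the same 0.15 rate is applied inside the dataset `map` operation, and the three outputs are then padded to the fixed shapes expected by the T5 network.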