diff --git a/MindSPONGE/applications/model_cards/ProtT5.md b/MindSPONGE/applications/model_cards/ProtT5.md new file mode 100644 index 0000000000000000000000000000000000000000..297d5276f2f0e82634709870a7e3a3a0a8c62c60 --- /dev/null +++ b/MindSPONGE/applications/model_cards/ProtT5.md @@ -0,0 +1,161 @@ +# ProtT5 + +## 模型介绍 + +计算生物学和生物信息学从蛋白质序列中获得了大量的数据,非常适合使用自然语言处理中的语言模型。这些语言模型以低推理成本达到了新的预测效果。ProtTrans提供了先进的预训练模型用于蛋白质研究。其中,ProtT5是项目中多个预训练模型中,效果最好的。 +详细信息见项目主页及论文:[github](https://github.com/agemagician/ProtTrans) + +我们提供了Mindspore框架下, ProtT5模型的checkpoint, 预测接口, 训练接口;同时提供了ProtT5预训练模型在相关下游任务中的训练接口和预测接口。 +ProtTrans论文中有两类下游任务,预测蛋白质相关性质和氨基酸的性质,对应的分别是sample level和token level的分类模型; 实验具体信息可以参考作者论文和项目主页中的描述。 +下游任务实验的训练数据, 评测数据都可以从项目主页中提供的链接下载。 + +### 模型权重获取 + +模型权重可以从mindspore默认的[checkpoint](https://download-mindspore.osinfra.cn/mindscience/mindsponge/ProtT5/checkpoint/)仓下载,也可以下载官方的torch权重文件转换。 +torch版本的权重文件下载链接:[模型页面](https://huggingface.co/Rostlab/prot_t5_xl_uniref50),然后使用`prot_t5/scripts`文件夹中的`convert_weight.py`脚本转换为mindspore支持的格式, 脚本使用方式如下: + +```shell +python scripts/convert_weight.py --layers 24 --torch_path pytorch_model.bin --mindspore_path ./mindspore_t5.ckpt +``` + +转换完成后,需要添加yaml格式的配置文件; 具体可以参考: `model_configs/ProtT5/t5_xl.yaml`; 直接把这个文件复制到相应的目录中也可以; 配置文件参数的含义可以参考mindformers中t5_config定义。 这些文件在`MindSPONGE`项目下。 + +- 文件结构 + +```bash +# checkpoint文件组织格式如下 +└── prot_t5_xl_uniref50 + ├── prot_t5_xl_uniref50.ckpt # 权重文件; 需要从torch的bin文件转换而来 + ├── prot_t5.yaml # 网络配置文件,需要手动添加 + ├── special_tokens_map.json # tokenizer + ├── spiece.model # tokenizer model + └── tokenizer_config.json # tokenizer config +``` + +## 如何使用 + +### Dependencies + +```bash +mindspore >= 2.3.0 +mindformers >= 1.2.0 +sentencepiece >= 0.2.0 +``` + +### ProtT5预测 + +```bash +from mindsponge import PipeLine + +config_path = 'configs/t5_predict.yaml' # 根据需要改为本地路径 +pipe = PipeLine(name = "ProtT5") +pipe.set_device_id(0) +pipe.initialize(config_path=config_path) + +# pridict +data = ["A E T C Z A O", "S K T Z 
P"] +res = pipe.predict(data, mode="generate") +print("Generated:", res) +# Generated: ['A E T C X A X', 'S K T X P'] + +res = pipe.predict(data, mode="embedding") +print("Embedding:", res) +# Embedding: +[[[ 1.71719193e-01 -1.40796244e-01 -2.04709724e-01 ... 1.45269990e-01 + 1.47509247e-01 -7.32109100e-02] + [ 9.36630294e-02 -1.16918117e-01 -2.99756974e-01 ... 1.00125663e-01 + -2.26259604e-01 2.25636318e-01] + [ 1.93479404e-01 -9.52076018e-02 -2.92140573e-01 ... 6.69623986e-02 + 3.05505600e-02 1.31701231e-01] + ... +``` + +### ProtT5预训练 + +```bash +# 单卡; 按照配置文件配置好yaml文件 +config_path = 'configs/t5_pretrain.yaml' # 根据需要改为本地路径 +pipe = PipeLine(name = "ProtT5") +pipe.initialize(config_path=config_path) +pipe.model.init_trainer() +pipe.model.train() + +# 使用多卡并行; run_pretrain.py中代码就是单卡的代码,使用msrun启动 +msrun --worker_num=${worknum} --local_worker_num=${worknum} --master_port=8128 --log_dir=msrun_log --join=True --cluster_time_out=600 ./run_pretrain.py +``` + +### 下游任务 + +```bash +import mindspore as ms +from mindsponge import PipeLine + +pipe = PipeLine(name = "ProtT5Downstream") +pipe.set_device_id(0) +config_path = 'configs/t5_downstream_task_eval.yaml' +pipe.initialize(config_path=config_path) + +# pridict +data = ["S L R F T A S T S T P K S G S K I A K R G K K H P E P V A S W M S E Q R W A G E P E V M C T L Q H K S I A Q E A Y K N Y T I T T S A V C K L V R Q L Q Q Q A L S L Q V H F E R S E R V L S G L Q A S S L P E A L A G A T Q L L S H L D D F T A T L E R R G V F F N D A K I E R R R Y E Q H L E Q I R T V S K D T R Y S L E R Q H Y I N L E S L L D D V Q L L K R H T L I T L R L I F E R L V R V L V I S I E Q S Q C D L L L R A N I N M V A T L M N I D Y D G F R S L S D A F V Q N E A V R T L L V V V L D H K Q S S V R A L A L R A L A T L C C A P Q A I N Q L G S C G G I E I V R D I L Q V E S A G E R G A I E R R E A V S L L A Q I T A A W H G S E H R V P G L R D C A E S L V A G L A A L L Q P E"] +res = pipe.predict(data) +print("Output:", res) +# Output: ['Cytoplasm'] + +# 
评估测试集; 项目主页有数据集下载地址 +eval_data_path = "./dataset/deeploc_test_set.csv" +pipe.model.eval_acc(eval_data_path) +# Accuracy 0.8129 + +# train +# config文件中设置好train_data_path和eval_data_path等参数 +# yaml文件中parallel设为True +ms.set_context(mode=ms.GRAPH_MODE, device_target='Ascend', device_id=0) +pipe = PipeLine(name = "ProtT5Downstream") +config_path = 'configs/t5_downstream_task_train.yaml' +pipe.initialize(config_path=config_path) +pipe.model.train() +``` + +### 预训练说明 + +ProtTrans主要工作是在蛋白质氨基酸序列上训练的预训练模型, 下面是模型训练相关一些说明。 + +- 数据转换 + +为了训练效率,首先需要原始数据转换成mindrecord格式。原始的预训练数据可以使用`uniref50`数据, 下面是数据转换脚本的路径及其使用方式。`number_samples`指定了想转换的样本数量,默认是`-1`转换全部数据。 + +```shell +# 参数分别是: 原始csv数据目录; 转换后的目录; 模型checkpoint路径; 转换样本数 +python scripts/trans_csv_to_mindrecord.py --data_dir ../unif50 --output_dir ../unif50_mindrecord --t5_config_path ../prot_t5_xl_uniref50 --number_samples 50000 +``` + +- T5参数配置 + +除了网络中每层的参数量和dropout比例,下面几个参数也需要注意 + +```yaml +# 初始化权重缩放比例; 一般小于等于1 +initializer_factor: 1.0 + +# 每一层的数据类型,兼容混合精度: float32 或 float16 +# T5模型中建议全部使用float32 +param_init_type: "float32" +layernorm_compute_type: "float32" +softmax_compute_type: "float32" +compute_dtype: "float32" +``` + +## 引用 + +```bash +@article{9477085, + author={Elnaggar, Ahmed and Heinzinger, Michael and Dallago, Christian and Rehawi, Ghalia and Yu, Wang and Jones, Llion and Gibbs, Tom and Feher, Tamas and Angerer, Christoph and Steinegger, Martin and Bhowmik, Debsindhu and Rost, Burkhard}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + title={ProtTrans: Towards Cracking the Language of Lives Code Through Self-Supervised Deep Learning and High Performance Computing}, + year={2021}, + volume={}, + number={}, + pages={1-1}, + doi={10.1109/TPAMI.2021.3095381} +} +``` \ No newline at end of file diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..2281405ed4dfe2252541611f1f6520fd1d2812a8 --- /dev/null +++ b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml @@ -0,0 +1,13 @@ +mode: eval +task_name: "hhblits" # "hhblits" or "deeploc" +t5_config_path: "prot_t5_xl_uniref50_ms" # T5 base模型; 包含yaml,ckpt,tokenizor model的目录 +checkpoint_path: "./model_deeploc_loc.ckpt" # 下游任务的ckpt; 未开源需要自己训练 + +train: + lr: 0.001 + epochs: 4 + batch_size: 16 + train_data_path: "" # 预测时为空 + eval_data_path: "" + checkpoint_save_path: null + cate_name: '' \ No newline at end of file diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76c1dad842f48dcbcc37dbb0b0945f388c88b1ef --- /dev/null +++ b/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml @@ -0,0 +1,13 @@ +mode: train +task_name: "deeploc" # "hhblits" or "deeploc" +t5_config_path: "prot_t5_xl_uniref50_ms" # T5 base模型;包含yaml,ckpt,tokenizor model的目录 +checkpoint_path: null + +train: + lr: 0.001 # learning rate; 默认使用Cosine decay learning rate + epochs: 4 + batch_size: 16 + train_data_path: "./deeploc_our_train_set.csv" + eval_data_path: "./deeploc_test_set.csv" + checkpoint_save_path: null + cate_name: "membrane" # 两个任务,通过cate_name区分: membrane和loc; hhblits任务不需要 \ No newline at end of file diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml new file mode 100644 index 0000000000000000000000000000000000000000..826c16ac54794c926c87542fce82bdd1b7e7b326 --- /dev/null +++ b/MindSPONGE/applications/model_configs/ProtT5/t5_predict.yaml @@ -0,0 +1,16 @@ +mode: "eval" # eval或者train +parallel: False # 是否使用并行 +t5_config_path: "prot_t5_xl_uniref50_ms/" # T5 base模型;包含yaml,ckpt,tokenizor model的目录 +load_model_path: null + +# 训练相关参数 +train: + 
train_data_path: '' + lr: 2.0e-5 + warmup_steps: 0 + batch_size: 32 + epochs: 1 + save_steps: 20000 + save_ckpt_path: "output/" + use_clip_grad: True + max_grad_norm: 1 \ No newline at end of file diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bccef82e0b69d276523b201f376cc27c5b678fd6 --- /dev/null +++ b/MindSPONGE/applications/model_configs/ProtT5/t5_pretrain.yaml @@ -0,0 +1,16 @@ +mode: "train" +parallel: False # 是否使用并行 +t5_config_path: "prot_t5_xl_uniref50_ms/t5_xl.yaml" # T5 yaml配置文件 +load_model_path: null # 可以配置checkpoint ckpt路径;用于继续训练 + +# 训练相关参数 +train: + train_data_path: 'train_data' # 训练文件目录;训练文件为mindrecord格式 + lr: 2.0e-5 # learning rate + warmup_steps: 0 + batch_size: 32 + epochs: 1 + save_steps: 20000 + save_ckpt_path: "output/" + use_clip_grad: True + max_grad_norm: 1 \ No newline at end of file diff --git a/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml b/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79fa2059897358da8ac8dec2ca27ba13ecedb240 --- /dev/null +++ b/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml @@ -0,0 +1,43 @@ +model: + arch: + type: T5ForConditionalGeneration + model_config: + attention_dropout_rate: 0.1 + batch_size: 8 + d_ff: 16384 + d_kv: 128 + do_sample: false + embedding_dropout_prob: 0.1 + eos_token_id: 1 + has_relative_bias: true + hidden_act: relu + hidden_dropout_rate: 0.1 + hidden_size: 1024 + is_encoder_decoder: true + layer_norm_epsilon: 1.0e-06 + length_penalty_weight: 1.0 + max_position_embeddings: 512 + num_heads: 32 + num_layers: 24 + offset: 0 + pad_token_id: 0 + post_layernorm_residual: false + relative_attention_num_buckets: 32 + repetition_penalty: 1 + scale_output: true + seq_length: 512 + max_decode_length: 512 + start_token_id: 0 + top_k: 1 + top_p: 0.95 + type: 
T5Config + use_cache: False + use_past: False + vocab_size: 128 + + initializer_factor: 0.7 + initializer_range: 0.02 + param_init_type: "float32" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + compute_dtype: "float32" \ No newline at end of file diff --git a/MindSPONGE/requirements.txt b/MindSPONGE/requirements.txt index ae2e689c056f6b53f0a4338e8f78cc82f89e5128..9b940d1b58dbe91fe925d1b48212b4d50e90064b 100644 --- a/MindSPONGE/requirements.txt +++ b/MindSPONGE/requirements.txt @@ -11,4 +11,6 @@ descriptastorus == 2.6.1 pyparsing >= 3.0.7 rdkit bio -scikit-learn \ No newline at end of file +scikit-learn +mindformers >= 1.2.0 +sentencepiece >= 0.2.0 \ No newline at end of file diff --git a/MindSPONGE/src/mindsponge/pipeline/models/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/__init__.py index 3e0858a40e53012d90df47bd48f2bef2ae5d134f..3a01e80770f0372da696b2f99c7f1da816e56252 100644 --- a/MindSPONGE/src/mindsponge/pipeline/models/__init__.py +++ b/MindSPONGE/src/mindsponge/pipeline/models/__init__.py @@ -35,3 +35,5 @@ from .multimer import Multimer, MultimerDataSet, multimer_configuration from .proteinmpnn import ProteinMpnn, ProteinMpnnDataset, proteinmpnn_configuration from .ufold import UFold, UFoldDataSet, ufold_configuration from .rasp import RASP, RASPDataSet, rasp_configuration +from .prot_t5 import ProtT5, ProtT5TrainDataSet, prott5pretrain_configuration +from .prot_t5 import ProtT5DownstreamTasks, ProtT5TaskDataSet, prott5downtask_configuration diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..43112e748230b00ccd53918942163588a5823b48 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Prot T5""" +from .pretrain.pretrain_prott5 import ProtT5 +from .pretrain.pretrain_dataloader import ProtT5TrainDataSet +from .pretrain.pretrain_configuration import prott5pretrain_configuration + +from .downstream.prott5_downstream_tasks import ProtT5DownstreamTasks +from .downstream.task_datasets import ProtT5TaskDataSet +from .downstream.downstream_configuration import prott5downtask_configuration diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e8806c147efb3ea0fff0dda381d56cb3cdacbfb5 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Module""" diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py new file mode 100644 index 0000000000000000000000000000000000000000..0b17a7e3e999b7db110806cc026b3e688d126822 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/deeploc_task.py @@ -0,0 +1,100 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Pretrain property prediction task; deeploc dataset.""" +from mindspore import nn +import mindspore.ops as ops + +from .downstream_nets import MeanPoolingClassifier, EmbeddingTaskNet +from .downstream_task import BaseTask, lr_secheduler +from .task_datasets import create_deeploc_dataset, map_label_to_category, \ + LOC_CATES, LOC_LABEL_TO_CATE, MEMBRANE_LABEL_TO_CATE + + +class DeeplocTask(BaseTask): + """ + Pretrain property prediction task of deeploc dataset. The network is a mean pooling classifier of embeddings. + """ + def __init__(self, config): + """ + This method initializes the network, has train and eval interface. 
+ cate_name represents different tasks: "loc", "membrane" + """ + super().__init__(config) + self.cate_name = config.train.cate_name + + if self.cate_name == 'loc': + self.label_to_cate = LOC_LABEL_TO_CATE + num_classes = len(LOC_CATES) + else: + self.label_to_cate = MEMBRANE_LABEL_TO_CATE + num_classes = 2 + + # apply mean pooling in hidden states of prot T5 model encoder + mpc_net = MeanPoolingClassifier(num_classes) + self.net = EmbeddingTaskNet(mpc_net, self.t5_config_path) + + if self.checkpoint_path: + self.net.load_from_pretrained(self.checkpoint_path) + + if self.train_conf.train_data_path: + self.train_dataset = create_deeploc_dataset(self.train_conf.train_data_path, \ + self.t5_tokenizer, batch_size=self.train_conf.batch_size, cate_name=self.cate_name) + self.loss_fn = nn.CrossEntropyLoss(reduction='mean') + batch_num = self.train_dataset.get_dataset_size() + learning_rate = lr_secheduler(self.train_conf.lr, batch_num, self.train_conf.epochs) + self.optimizer = nn.Adam(self.net.trainable_params(), learning_rate=learning_rate) + + if self.train_conf.eval_data_path: + self.eval_dataset = create_deeploc_dataset(self.train_conf.eval_data_path, \ + self.t5_tokenizer, batch_size=self.train_conf.batch_size, cate_name=self.cate_name) + + @staticmethod + def __eval_fn(model_fn, dataset): + """eval give dataset with model; staticmethod""" + metric = nn.Accuracy('classification') + metric.clear() + for inputs, masks, targets in dataset: + logits = model_fn(inputs, masks) + metric.update(logits, targets) + + accuracy = metric.eval() + dataset.reset() + return accuracy + + def eval_fn(self, dataset): + """eval dataset""" + return DeeplocTask.__eval_fn(self.net, dataset) + + # pylint: disable=W0221 + def forward_fn(self, inputs, masks, targets): + """forward loss""" + logits = self.net(inputs, masks) + loss = self.loss_fn(logits, targets) + return loss + + def eval_acc(self, eval_data_path): + """eval accuracy data file""" + eval_dataset = 
create_deeploc_dataset(eval_data_path, self.t5_tokenizer, \ + batch_size=self.train_conf.batch_size, cate_name=self.cate_name) + return self.eval_fn(eval_dataset) + + def predict(self, data): + """predict""" + logits = self.net(*data) + softmax = ops.Softmax(axis=1) + probabilities = softmax(logits) + predicted_labels = ops.Argmax(axis=1)(probabilities).asnumpy() + predicted_cates = map_label_to_category(predicted_labels, self.label_to_cate) + return predicted_cates diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_configuration.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_configuration.py new file mode 100644 index 0000000000000000000000000000000000000000..7ffbe7e4084e2c3fb62f0afb5dccb94539ffe347 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_configuration.py @@ -0,0 +1,23 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"prott5 downstream task configure" +prott5downtask_configuration = { + "protT5_base": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_xl.yaml", + "protT5downtask_predict": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_eval.yaml", + "protT5downtask_train": + "https://gitee.com/mindspore/mindscience/raw/master/MindSPONGE/applications/model_configs/ProtT5/t5_downstream_task_train.yaml" +} diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_nets.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_nets.py new file mode 100644 index 0000000000000000000000000000000000000000..5d390561edbc70e5cda199c812e46724d640b229 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_nets.py @@ -0,0 +1,111 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Networks of downstream task.""" +import logging + +import mindspore as ms +import mindspore.nn as nn +import mindspore.ops as ops +from mindformers import T5ForConditionalGeneration + + +logger = logging.getLogger(__name__) +EMBEDIING_LENGTH = 1024 + + +class ConvNet(nn.Cell): + """Acid token level predictor; using convolution net to convergence of local information.""" + def __init__(self): + super().__init__() + # CNN weights are trained on ProtT5 embeddings + self.feature_extractor = nn.SequentialCell([ + nn.Conv2d(EMBEDIING_LENGTH, 32, kernel_size=(7, 1), pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True), # 7x32 + nn.ReLU(), + nn.Dropout(p=0.1), + ]) + + n_final_in = 32 + self.dssp3_classifier = nn.Conv2d(n_final_in, 3, kernel_size=(7, 1), \ + pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + self.dssp8_classifier = nn.Conv2d(n_final_in, 8, kernel_size=(7, 1), \ + pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + self.diso_classifier = nn.Conv2d(n_final_in, 2, kernel_size=(7, 1), \ + pad_mode='pad', padding=(3, 3, 0, 0), has_bias=True) + + def construct(self, embeddings, masks): + """construct: IN: X = (B x L x F); OUT: (B x F x L, 1)""" + x = embeddings * ops.expand_dims(masks, -1) + x = ops.Transpose()(x, (0, 2, 1)).unsqueeze(-1) + x = self.feature_extractor(x) # OUT: (B x 32 x L x 1) + d3_clf = self.dssp3_classifier(x).squeeze(-1).transpose((0, 2, 1)) # OUT: (B x L x 3) + d8_clf = self.dssp8_classifier(x).squeeze(-1).transpose((0, 2, 1)) # OUT: (B x L x 8) + diso_clf = self.diso_classifier(x).squeeze(-1).transpose((0, 2, 1)) # OUT: (B x L x 2) + return d3_clf, d8_clf, diso_clf + + +class MeanPoolingClassifier(nn.Cell): + """Acid sequence level predictor; using mean pooling classifier.""" + def __init__(self, num_classes): + super().__init__() + self.num_classes = num_classes + self.dense = nn.Dense(EMBEDIING_LENGTH, 32, activation='relu') + self.classifier = 
nn.Dense(32, num_classes) + self.dropout = nn.Dropout(p=0.1) + + def construct(self, embeddings, masks): + """construct""" + masks = ops.cast(masks, ms.float32) + masked_inputs = embeddings * ops.expand_dims(masks, -1) + mean_pooled = ops.ReduceMean(keep_dims=False)(masked_inputs, 1) + mean_pooled = self.dropout(mean_pooled) + compressed = self.dense(mean_pooled) + output = self.classifier(compressed) + return output + + +class EmbeddingTaskNet(nn.Cell): + """Base net of Embedding part for downstream task.""" + def __init__(self, downstream_net, t5_config_path): + super(EmbeddingTaskNet, self).__init__() + self.downstream_net = downstream_net + + self.t5 = T5ForConditionalGeneration.from_pretrained(t5_config_path) + self.t5.set_train(False) + + # freeze pretrain model parameters + for param in self.t5.trainable_params(): + param.requires_grad = False + + def construct(self, inputs, masks): + """construct""" + masks = ops.cast(masks, ms.float32) + embeddings = self.t5.encoder_forward(inputs, masks) + output = self.downstream_net(embeddings, masks) + return output + + def load_from_pretrained(self, config_path): + """load downstream task checkpoint""" + non_pretrained_param_dict = ms.load_checkpoint(config_path) + param_not_load, _ = ms.load_param_into_net(self.downstream_net, non_pretrained_param_dict) + self.downstream_net.set_train(False) + self.set_train(False) + logger.warning("Not Loaded param list: %s", param_not_load) + + def save_checkpoint(self, model_path): + """save checkpoint""" + non_pretrained_param_dict = {} + for param in self.downstream_net.trainable_params(): + non_pretrained_param_dict[param.name] = param.data + ms.save_checkpoint(non_pretrained_param_dict, model_path) diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_task.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_task.py new file mode 100644 index 
0000000000000000000000000000000000000000..70da086176124edb471cf47febd432ad721dd6e7 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/downstream_task.py @@ -0,0 +1,108 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Base trainer of downstream tasks.""" +from abc import abstractmethod + +from mindspore import nn, value_and_grad +from mindformers import T5Tokenizer +from mindformers.tools.logger import get_logger + + +logger = get_logger(logger_name='DownstreamTask') +PRINT_STEPS = 80 + + +def lr_secheduler(init_lr, batch_num, epochs): + """Cosine decay learning rate""" + lr_max = init_lr # max lr + lr_min = 5e-5 # min lr + decay_steps = int(epochs * batch_num) + lr_sch = nn.CosineDecayLR(min_lr=lr_min, max_lr=lr_max, decay_steps=decay_steps) + return lr_sch + + +class BaseTask: + """ + Base class of downstream tasks with train and eval interface. + + Args: + config.cate_name: the dataset has two subtask with different label name: 'loc', 'membrane' + config.t5_config_path: prot t5 pretrain model directory path. + config.checkpoint_path: the task checkpoint path; use to eval. 
+ """ + def __init__(self, config): + self.mode = config.mode + self.task_name = config.task_name + self.t5_config_path = config.t5_config_path + self.checkpoint_path = config.checkpoint_path + + self.train_conf = config.train + self.checkpoint_save_path = config.train.checkpoint_save_path + self.epochs = config.train.epochs + + self.net = None + self.train_dataset = None + self.eval_dataset = None + self.grad_fn = None + self.t5_tokenizer = T5Tokenizer.from_pretrained(config.t5_config_path) + + @abstractmethod + def eval_acc(self, eval_data_path): + pass + + @abstractmethod + def forward_fn(self, *args): + pass + + @abstractmethod + def eval_fn(self, dataset): + pass + + def train_step(self, *args): + """train step""" + loss, grads = self.grad_fn(*args) + self.optimizer(grads) + return loss + + def train(self): + """train""" + weights = self.net.trainable_params() + self.grad_fn = value_and_grad(self.forward_fn, None, weights) + + logger.info("Begin training...") + for epoch in range(self.epochs): + logger.info("Epoch: %d", epoch) + step = 0 + loss_steps = 0.0 + for inputs in self.train_dataset: + step += 1 + loss = self.train_step(*inputs) + loss_steps += loss.asnumpy() + if step % PRINT_STEPS == 0: + logger.info("loss: %.4f", loss_steps / PRINT_STEPS) + loss_steps = 0.0 + + self.train_dataset.reset() + + logger.info("Training done") + + if self.eval_dataset: + logger.info("Begin eval...") + acc = self.eval_fn(self.eval_dataset) + logger.info("Accuracy: %s", str(acc)) + + if self.checkpoint_save_path: + self.net.save_checkpoint(self.checkpoint_save_path) + logger.info("Checkpoint dumpped successful") diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py new file mode 100644 index 0000000000000000000000000000000000000000..c7c28fbc5f97c377c16e7b6088bb0aeb3378230d --- /dev/null +++ 
b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/hhblits_task.py @@ -0,0 +1,121 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Pretrain property prediction task; hhblits dataset.""" +from mindspore import nn +import mindspore.ops as ops + +from .task_datasets import create_hhblits_dataset, map_label_to_category, \ + LABEL_MASKER, HHBLITS_D3_LABEL_TO_CATE, HHBLITS_D8_LABEL_TO_CATE +from .downstream_nets import ConvNet, EmbeddingTaskNet +from .downstream_task import BaseTask, lr_secheduler + + +class TokenLevelAccuracy: + """Sequence token level classify task accuracy.""" + def __init__(self, num_classes): + self.num_classes = num_classes + self.acc = nn.Accuracy('classification') + self.acc.clear() + + def update(self, logits, labels): + act_labels = labels.view(-1) + act_logits = logits.view(-1, self.num_classes) + valid_labels = act_labels[act_labels > -1] + valid_logits = act_logits[act_labels > -1] + + self.acc.update(valid_logits, valid_labels) + + def get(self): + return self.acc.eval() + + +class HHblitsTask(BaseTask): + """Pretrain property prediction task of hhblits dataset. 
The network is a convnet and token level classifier.""" + def __init__(self, config): + super().__init__(config) + cnn_net = ConvNet() + self.net = EmbeddingTaskNet(cnn_net, self.t5_config_path) + + if self.checkpoint_path: + self.net.load_from_pretrained(self.checkpoint_path) + + if self.train_conf.train_data_path: + self.train_dataset = create_hhblits_dataset(self.train_conf.train_data_path, + self.t5_tokenizer, self.train_conf.batch_size) + batch_num = self.train_dataset.get_dataset_size() + learning_rate = lr_secheduler(self.train_conf.lr, batch_num, self.train_conf.epochs) + self.optimizer = nn.Adam(self.net.trainable_params(), learning_rate=learning_rate) + self.loss_fn = nn.CrossEntropyLoss(reduction='mean', ignore_index=LABEL_MASKER) + + # eval + if self.train_conf.eval_data_path: + self.eval_dataset = create_hhblits_dataset(self.train_conf.eval_data_path, + self.t5_tokenizer, self.train_conf.batch_size) + + @staticmethod + def __eval_fn(model_fn, dataset): + """eval dataset accuracy.""" + metric_q3 = TokenLevelAccuracy(3) + metric_q8 = TokenLevelAccuracy(8) + + for inputs, masks, d3labels, d8labels, _ in dataset: + logits1, logits2, _ = model_fn(inputs, masks) + metric_q3.update(logits1, d3labels) + metric_q8.update(logits2, d8labels) + + dataset.reset() + m3acc = metric_q3.get() + m8acc = metric_q8.get() + return m3acc, m8acc + + def eval_fn(self, dataset): + """eval dataset""" + return HHblitsTask.__eval_fn(self.net, dataset) + + def eval_acc(self, eval_data_path): + """eval accuracy of data file""" + eval_dataset = create_hhblits_dataset(eval_data_path, self.t5_tokenizer, self.train_conf.batch_size) + m3acc, m8acc = self.eval_fn(eval_dataset) + return m3acc, m8acc + + def token_level_crossentoryloss(self, logits, labels, num_classes, loss_fn): + """token level crossentory loss""" + activate_labels = labels.view(-1) + activate_logits = logits.view(-1, num_classes) + return loss_fn(activate_logits, activate_labels) + + def predict(self, data): + 
"""predict""" + logits1, logits2, _ = self.net(*data) + softmax = ops.Softmax(axis=-1) + probabilities1 = softmax(logits1) + probabilities2 = softmax(logits2) + + # get token index of predict max probabilities + predicted_labels1 = ops.Argmax(axis=-1)(probabilities1).asnumpy() + predicted_labels2 = ops.Argmax(axis=-1)(probabilities2).asnumpy() + predicted_cates1 = map_label_to_category(predicted_labels1, HHBLITS_D3_LABEL_TO_CATE) + predicted_cates2 = map_label_to_category(predicted_labels2, HHBLITS_D8_LABEL_TO_CATE) + return predicted_cates1, predicted_cates2 + + # pylint: disable=W0221 + def forward_fn(self, inputs, masks, d3labels, d8labels): + """multitask loss""" + logits1, logits2, _ = self.net(inputs, masks) + loss1 = self.token_level_crossentoryloss( + logits1, d3labels, 3, self.loss_fn) + loss2 = self.token_level_crossentoryloss( + logits2, d8labels, 8, self.loss_fn) + return 0.5 * loss1 + 0.5 * loss2 diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..1e15413b7cde12176b0481d68252c75b0c1f17ac --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/prott5_downstream_tasks.py @@ -0,0 +1,79 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""ProtT5 Downstream Task class implement.""" +from mindspore import jit + +from ...model import Model + +from .hhblits_task import HHblitsTask +from .deeploc_task import DeeplocTask + + +class ProtT5DownstreamTasks(Model): + '''ProtT5DownstreamTasks''' + + def __init__(self, config): + self.name = "ProtT5DownstreamTasks" + self.mixed_precision = False + + self.config = config + self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/prot_t5_xl.ckpt" + self.checkpoint_path = "./prot_t5_xl.ckpt" + + self.mode = config.mode + self.task_name = config.task_name + + if self.task_name == "hhblits": + self.network = HHblitsTask(config) + elif self.task_name == "deeploc": + self.network = DeeplocTask(config) + + super().__init__(self.checkpoint_url, self.checkpoint_path, self.network, self.name, None, + mixed_precision=self.mixed_precision) + + + def forward(self, data): + pass + + def backward(self, data): + pass + + # pylint: disable=W0221 + def predict(self, data): + return self.network.predict(data) + + def eval_acc(self, data_path): + """eval accuracy of data file""" + if self.task_name == "hhblits": + m3acc, m8acc = self.network.eval_acc(data_path) + print("Accuracy Q3 %.4f; Q8 %.4f" % (m3acc, m8acc)) + elif self.task_name == "deeploc": + acc = self.network.eval_acc(data_path) + print("Accuracy %.4f" % acc) + + def train(self): + """train""" + self.network.train() + + @jit + def train_step(self, data): + self.network.train_step(*data) + + @jit + def _jit_forward(self, data): + pass + + def _pynative_forward(self, data): + pass diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..19f1db1438c56e04052900275fc6731d581263e1 --- /dev/null +++ 
b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/downstream/task_datasets.py @@ -0,0 +1,202 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""dataset loader of tasks.""" +import random +import re + +import numpy as np +import pandas as pd + +import mindspore.dataset as ds +from mindformers import T5Tokenizer + +from ..utils.utils import seqs_tokenizer +from ....dataset import DataSet + +LABEL_MASKER = -100 + +# Category Definitions in Data +MEMBRANE_CATES = {'M': 0, 'S': 1, 'U': 1} +LOC_CATES = { + 'Cell.membrane': 0, 'Cytoplasm': 1, 'Endoplasmic.reticulum': 2, + 'Golgi.apparatus': 3, 'Lysosome/Vacuole': 4, 'Mitochondrion': 5, + 'Nucleus': 6, 'Peroxisome': 7, 'Plastid': 8, 'Extracellular': 9 + } +HHBLITS_D3_CATES = {'C': 0, 'E': 1, 'H': 2} +HHBLITS_D8_CATES = {'C': 0, 'E': 1, 'H': 2, 'B': 3, 'G': 4, 'I': 5, 'S': 6, 'T': 7} + + +def reverse_dict(original_dict): + """reverse dict""" + return {value: key for key, value in original_dict.items()} + +MEMBRANE_LABEL_TO_CATE = {0: 'M', 1: 'S'} +LOC_LABEL_TO_CATE = reverse_dict(LOC_CATES) +HHBLITS_D3_LABEL_TO_CATE = reverse_dict(HHBLITS_D3_CATES) +HHBLITS_D8_LABEL_TO_CATE = reverse_dict(HHBLITS_D8_CATES) + + +def map_label_to_category(labels, dct): + """map label to category""" + vectorized_map = np.vectorize(lambda label: dct.get(label, '')) + str_labels = vectorized_map(labels) + return 
str_labels + + +def seq2array(seq): + """sequence to numpy array""" + return np.array(seq).astype(np.int32) + + +def pad_trunc_addspecial(seq, max_length, sp=LABEL_MASKER, pad=0, add_special_tokens=True): + """pad trunc addspecial""" + if len(seq) > max_length: + if add_special_tokens: + seq = seq[:max_length-1] + else: + seq = seq[:max_length] + + if add_special_tokens: + seq.append(sp) + + padded = seq + [pad] * max_length + return padded[:max_length] + + +def get_task_info(cate_name): + """get task info""" + if cate_name == 'loc': + return LOC_CATES + if cate_name == 'membrane': + return MEMBRANE_CATES + return {} + + +def apply_tokenizer(text, tokenizer): + """apply tokenizer""" + text = re.sub(r"[UZOB]", "X", text) + tokens = tokenizer(text, padding='max_length', truncation=True, add_special_tokens=True, max_length=512) + ids, masks = tokens['input_ids'], tokens['attention_mask'] + return ids, masks + + +def create_deeploc_dataset(file_path, tokenizer, batch_size=32, cate_name=''): + """create deeploc dataset""" + df = pd.read_csv(file_path) + res = [] + df = df.rename(columns=lambda x: x.strip()) + + cate_dict = get_task_info(cate_name) + + for _, row in df.iterrows(): + text, cate = row['input'], row[cate_name] + ids, masks = apply_tokenizer(text, tokenizer) + eles = [seq2array(x) for x in [ids, masks, cate_dict.get(cate)]] + res.append(tuple(eles)) + + random.shuffle(res) + dataset = ds.GeneratorDataset(res, column_names=["inputs", "masks", "labels"]) + dataset = dataset.shuffle(buffer_size=128) + dataset = dataset.batch(batch_size=batch_size) + return dataset + + +def load_hhblits_dataset(path): + """load hhblits dataset""" + df = pd.read_csv(path, names=['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'], skiprows=1) + df = df.rename(columns=lambda x: x.strip()) + + input_fixed = ["".join(seq.split()) for seq in df['input']] + input_fixed = [re.sub(r"[UZOB]", "X", seq) for seq in input_fixed] + seqs = [" ".join(seq) for seq in input_fixed] + + 
label_fixed3 = ["".join(label.split()) for label in df['dssp3']] + d3_labels = [list(label) for label in label_fixed3] + + label_fixed8 = ["".join(label.split()) for label in df['dssp8']] + d8_labels = [list(label) for label in label_fixed8] + + return seqs, d3_labels, d8_labels + + +def create_hhblits_dataset(file_path, tokenizer, batch_size=32): + """create hhblits dataset""" + seqs, d3_labels, d8_labels = load_hhblits_dataset(file_path) + res = [] + for seq, d3, d8 in zip(seqs, d3_labels, d8_labels): + ids, masks = apply_tokenizer(seq, tokenizer) + + d3 = [HHBLITS_D3_CATES[x.strip()] for x in d3] + d3 = pad_trunc_addspecial(d3, 512, pad=LABEL_MASKER) + + d8 = [HHBLITS_D8_CATES[x.strip()] for x in d8] + d8 = pad_trunc_addspecial(d8, 512, pad=LABEL_MASKER) + + eles = [ids, masks, d3, d8] + eles_tp = [seq2array(x) for x in eles] + res.append(tuple(eles_tp)) + + random.shuffle(res) + dataset = ds.GeneratorDataset(res, column_names=["inputs", "masks", "d3labels", "d8labels"]) + dataset = dataset.shuffle(buffer_size=128) + dataset = dataset.batch(batch_size=batch_size) + return dataset + + +class ProtT5TaskDataSet(DataSet): + """ProtT5 downstream task dataSet""" + def __init__(self, config): + self.task_name = config.task_name + self.data_path = None + self.dataset = None + self.batch_size = config.train.batch_size + self.t5_tokenizer = T5Tokenizer.from_pretrained(config.t5_config_path) + self.phase = None + + super().__init__() + + # pylint: disable=E0302 + def __getitem__(self): + pass + + def __len__(self): + if self.dataset: + return self.dataset.get_dataset_size() + return 0 + + def set_phase(self, phase): + self.phase = phase + + def process(self, data, **kwargs): + return seqs_tokenizer(data, self.t5_tokenizer, return_tensors="ms") + + def set_training_data_src(self, data_source): + self.data_path = data_source + + def download(self, path=None): + pass + + def data_parse(self, idx): + pass + + # pylint: disable=W0221 + def create_iterator(self, num_epochs, 
cate_name=''): + if self.task_name == "hhblits": + self.dataset = create_hhblits_dataset(self.data_path, self.t5_tokenizer, self.batch_size) + else: + self.dataset = create_deeploc_dataset(self.data_path, self.t5_tokenizer, self.batch_size, \ + cate_name=cate_name) + + return self.dataset diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e8806c147efb3ea0fff0dda381d56cb3cdacbfb5 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Module""" diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/optimization.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..0392e80125cce74bc616bb0770d59bfee6ddd501 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/optimization.py @@ -0,0 +1,57 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""create optimizer""" +import mindspore +from mindspore import ops, nn +from mindspore.nn.learning_rate_schedule import LearningRateSchedule + +from mindspore.nn.optim import AdaFactor + + +class WarmUpPolynomialDecayLR(LearningRateSchedule): + """Polynomia Decay LR with Warmup""" + def __init__(self, learning_rate, end_learning_rate, warmup_steps, decay_steps, power): + super().__init__() + self.learning_rate = learning_rate + self.warmup_steps = max(warmup_steps, 1) + self.end_learning_rate = end_learning_rate + self.decay_steps = decay_steps + self.power = power + + def construct(self, global_step): + """construct""" + # warmup lr + warmup_percent = global_step.astype(mindspore.float32) / self.warmup_steps + warmup_learning_rate = self.learning_rate * warmup_percent + # polynomial lr + global_step = ops.minimum(global_step, self.decay_steps) + decayed_learning_rate = (self.learning_rate - self.end_learning_rate) * \ + ops.pow((1 - global_step / self.decay_steps), self.power) + \ + self.end_learning_rate + is_warmup = (global_step < self.warmup_steps).astype(mindspore.float32) + learning_rate = ((1.0 - is_warmup) * decayed_learning_rate + is_warmup * warmup_learning_rate) + return learning_rate + + +def create_optimizer(model, init_lr, optim_type, weight_decay=0.0): + """create optimizer""" + if optim_type == 'adafactor': + optim = AdaFactor(model.trainable_params()) + elif weight_decay > 0: + optim = nn.AdamWeightDecay(model.trainable_params(), init_lr, 
weight_decay=weight_decay) + else: + optim = nn.Adam(model.trainable_params(), learning_rate=init_lr) + + return optim diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_configuration.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_configuration.py new file mode 100644 index 0000000000000000000000000000000000000000..f43d4e4611847951942b8b6dcfbfc9060132a936 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_configuration.py @@ -0,0 +1,23 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"ProtT5 pretrain task configuration."
+# ============================================================================ +"""data loader of mindreord files; add mask, pad and bacth process.""" +import os + +import numpy as np +import mindspore.dataset as ds +from mindformers import T5Tokenizer + +from ..utils.utils import seqs_tokenizer +from ....dataset import DataSet + +MASK_TOKEN_ID = 33 +PAD_INDEX = 0 + + +class EncDecIds(): + """return `input_ids', 'masks', 'decode_ids'.""" + def __init__(self, mask_prob): + super().__init__() + self.mask_prob = mask_prob + + def __call__(self, raw_ids): + decode_ids = np.array(raw_ids) + mask_ids = np.ones_like(raw_ids) + + np.random.seed(23) + # determine which positions should be masked + random_mask = np.random.rand(len(raw_ids)) < self.mask_prob + random_mask[len(raw_ids) - 1] = False + raw_ids[random_mask] = MASK_TOKEN_ID + + return (np.array(raw_ids), mask_ids, decode_ids) + + +def find_mindrecord_files(directory): + """find mindrecord files""" + files = os.listdir(directory) + mindrecord_files = [os.path.join(directory, f) for f in files if f.endswith('.mindrecord')] + return mindrecord_files + + +def create_pretrain_dataset(mr_files, batch_size, epochs, rank_size=0, rank_id=0): + """create pretrain dataset""" + if rank_size > 0: + dataset = ds.MindDataset(dataset_files=mr_files, columns_list=["raw_ids"], + num_shards=rank_size, shard_id=rank_id, shuffle=True) + else: + dataset = ds.MindDataset(dataset_files=mr_files, columns_list=["raw_ids"], shuffle=True) + dataset = dataset.map(operations=EncDecIds(0.15), input_columns=["raw_ids"], + output_columns=["input_ids", "masks", "decode_ids"]) + + # default: pad = 0 + padding_shape = ([512], 0) + pad_info = {"input_ids": padding_shape, "masks": padding_shape, "decode_ids": padding_shape} + dataset = dataset.padded_batch(batch_size=batch_size, drop_remainder=True, pad_info=pad_info) + dataset = dataset.repeat(epochs) + return dataset + + +class ProtT5TrainDataSet(DataSet): + """ProtT5 downstream task dataSet""" + 
def __init__(self, config): + self.batch_size = config.train.batch_size + self.data_path = None + self.dataset = None + self.phase = None + self.t5_config_path = config.t5_config_path + self.tokenizer = None + + super().__init__() + + # pylint: disable=E0302 + def __getitem__(self): + pass + + def __len__(self): + if self.dataset: + return self.dataset.get_dataset_size() + return 0 + + def set_phase(self, phase): + self.phase = phase + + # pylint: disable=W0221 + def process(self, data, mode="embedding"): + re_type = "ms" + if mode == "generate": + re_type = "np" + + if not self.tokenizer: + self.tokenizer = T5Tokenizer.from_pretrained(self.t5_config_path) + return seqs_tokenizer(data, self.tokenizer, return_tensors=re_type) + + def set_training_data_src(self, data_source): + self.data_path = data_source + + def download(self, path=None): + pass + + def data_parse(self, idx): + pass + + # pylint: disable=W0221 + def create_iterator(self, num_epochs, rank_size=0, rank_id=0): + mr_files = find_mindrecord_files(self.data_path) + self.dataset = create_pretrain_dataset(mr_files, self.batch_size, num_epochs, + rank_size=rank_size, rank_id=rank_id) + data_loader = self.dataset.create_tuple_iterator() + return data_loader diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py new file mode 100644 index 0000000000000000000000000000000000000000..24ec028436f9de300b8d411557731f92ddd6e392 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/pretrain_prott5.py @@ -0,0 +1,224 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""ProtT5 Trainer with data parallel.""" +import time +import os + +import mindspore as ms +from mindspore import nn, value_and_grad +from mindspore.amp import all_finite +from mindspore.ops import functional as F +from mindspore.parallel._utils import _get_device_num, _get_gradients_mean +from mindspore.communication import init, get_rank, get_group_size +from mindformers.core.clip_grad import ClipGradNorm +from mindformers.tools.logger import get_logger +from mindformers import T5Tokenizer + +from .optimization import create_optimizer, WarmUpPolynomialDecayLR +from .pretrain_dataloader import create_pretrain_dataset, find_mindrecord_files +from .t5_modeling import create_model +from ..utils.utils import generate_checkpoint_filename +from ...model import Model + +PRINT_ITERS = 10 +logger = get_logger(logger_name='Pretrain') + + +class ProtT5(Model): + """ProtT5""" + name = "ProtT5" + + def __init__(self, config): + self.mixed_precision = False + self.config = config + self.use_parallel = config.parallel + self.rank_id = 0 + self.rank_size = 1 + self.init_context() + + self.checkpoint_url = "https://download.mindspore.cn/mindscience/mindsponge/ProtT5/checkpoint/prot_t5_xl.ckpt" + self.checkpoint_path = "./prot_t5_xl.ckpt" + self.mode = config.mode + self.train_conf = config.train + + if self.mode == "train": + self.network = create_model(config.t5_config_path, config.load_model_path) + self.init_trainer() + else: + self.tokenizer = 
T5Tokenizer.from_pretrained(config.t5_config_path) + self.network = create_model(config.t5_config_path, from_pretrained=True) + + super().__init__(self.checkpoint_url, self.checkpoint_path, self.network, self.name, None, + mixed_precision=self.mixed_precision) + + + def init_context(self): + """init context""" + if self.use_parallel: + init() + self.rank_id = get_rank() + self.rank_size = get_group_size() + ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend", device_id=self.rank_id) + ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, parameter_broadcast=True, + device_num=self.rank_size, gradients_mean=True) + + else: + ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") + + + def init_trainer(self): + """init trainer""" + if self.train_conf.save_ckpt_path: + os.makedirs(self.train_conf.save_ckpt_path, exist_ok=True) + + # data_loader + dataset_path = find_mindrecord_files(self.train_conf.train_data_path) + train_dataset = create_pretrain_dataset(dataset_path, self.train_conf.batch_size, self.train_conf.epochs, \ + rank_size=self.rank_size, rank_id=self.rank_id) + + self.train_dataloader = train_dataset.create_tuple_iterator() + num_train_steps = train_dataset.get_dataset_size() + + # grad clip + self.use_clip_grad = False + if self.train_conf.use_clip_grad: + self.use_clip_grad = True + self.clip_grad_norm = ClipGradNorm(max_norm=self.train_conf.max_grad_norm) + + # trick: warm up + if self.train_conf.warmup_steps > 0: + lr = WarmUpPolynomialDecayLR(self.train_conf.lr, 0.0, self.train_conf.warmup_steps, num_train_steps, 1.0) + else: + lr = self.train_conf.lr + + # Define optimizer. 
+ self.optimizer = create_optimizer(self.network, lr, 'adam', weight_decay=0) + + # data parall + if self.use_parallel: + degree = _get_device_num() + mean = _get_gradients_mean() + self.grad_reducer = nn.DistributedGradReducer(self.optimizer.parameters, mean, degree) + + weights = self.network.trainable_params() + self.grad_fn = value_and_grad(self.forward_fn, None, weights, has_aux=False) + + def forward_fn(self, input_ids, input_mask, decode_ids): + """forward loss""" + loss = self.network(input_ids, input_mask, decode_ids) + return loss + + def save_checkpoint(self, train_step_nums): + """save checkpoint""" + if self.rank_id == 0: + filename = generate_checkpoint_filename(self.train_conf.save_ckpt_path, train_step_nums) + ms.save_checkpoint(self.network, filename) + + def train(self): + """train""" + loss_total = 0 + cur_step_nums, train_step_nums, skip_step_nums = 0, 0, 0 + cur_time, avg_time = time.time(), 0 + + # step begin + self.network.set_train(True) + + for input_ids, input_mask, decode_ids in self.train_dataloader: + loss, is_finite = self._train_step(input_ids, input_mask, decode_ids) + if is_finite: + loss_total = loss_total + loss.asnumpy().item() + train_step_nums += 1 + else: + logger.warning(f"grads overflow, skip step {cur_step_nums}; loss: {loss}") + skip_step_nums += 1 + + if train_step_nums % PRINT_ITERS == 0 and train_step_nums != 0: + print_time = time.time() + total_time = print_time - cur_time + cur_time = print_time + avg_time = total_time / (PRINT_ITERS + skip_step_nums) + + logger.info(f"avg_time(ms): {avg_time * 1000:2f}, " + f"cur_step: {cur_step_nums}, " + f"skip_steps: {skip_step_nums:3d}, " + f"train_step: {train_step_nums}, " + f"loss: {loss_total/PRINT_ITERS:f}, ") + + loss_total = 0 + skip_step_nums = 0 + + # saving ckpt per N steps or last step + if train_step_nums % self.train_conf.save_steps == 0: + self.save_checkpoint(train_step_nums) + + cur_step_nums += 1 + + self.save_checkpoint(train_step_nums) + logger.info("Pretrain 
done!") + + + @ms.jit + def _train_step(self, input_ids, input_mask, decode_ids): + """train step jit function""" + loss, grads = self.grad_fn(input_ids, input_mask, decode_ids) + + if self.use_parallel: + grads = self.grad_reducer(grads) + + is_finite = all_finite(grads) + + if is_finite: + # Apply gradient clipping + if self.use_clip_grad: + grads, _ = self.clip_grad_norm(grads) + + loss = F.depend(loss, self.optimizer(grads)) + + return loss, is_finite + + def train_step(self, data): + """train step""" + return self._train_step(*data) + + # pylint: disable=W0221 + def predict(self, data, mode="embedding"): + """predict""" + self.network.set_train(False) + token_ids, attention_mask = data + if mode == "generate": + # Generate the sequence of input texts + output_ids = self.network.generate(token_ids, do_sample=False) + output_tokens = self.tokenizer.decode(output_ids, skip_special_tokens=True) + return output_tokens + + if mode == "embedding": + # Embedding of the final layer of encoder + outputs = self.network.encoder_forward(token_ids, attention_mask) + hiddens = outputs.asnumpy() + return hiddens + + return None + + def forward(self, data): + pass + + def backward(self, data): + pass + + def _jit_forward(self, data): + pass + + def _pynative_forward(self, data): + pass diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/t5_modeling.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/t5_modeling.py new file mode 100644 index 0000000000000000000000000000000000000000..379f5f27d0ee699eb3b5b34e63461f4510855549 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/pretrain/t5_modeling.py @@ -0,0 +1,101 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""T5 model with initial method.""" +import logging + +import mindspore +from mindspore.common.initializer import initializer, Constant, Normal + +from mindformers import AutoConfig +from mindformers import T5ForConditionalGeneration as T5WithLoss +from mindformers.modules.transformer.transformer import default_transformer_config, TransformerOpParallelConfig + +logger = logging.getLogger(__name__) + + +def set_data(weight, init_distribution): + """set data weight""" + weight.set_data(initializer(init_distribution, weight.shape, weight.dtype)) + + +def init_cell(cell, name, config): + """init cell""" + factor = config.initializer_factor + if "layernorm" in name: + set_data(cell.gamma, Constant(factor * 1.0)) + elif "tfm_embedding_lookup" in name: + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + set_data(cell.embedding_table, Normal(factor * 1.0)) + elif name.endswith("output"): + # default these cells has no bias + set_data(cell.mapping.weight, Normal(factor * ((config.hidden_size) ** -0.5))) + set_data(cell.projection.weight, Normal(factor * ((config.d_ff) ** -0.5))) + elif name.endswith("attention"): + d_model = config.hidden_size + key_value_proj_dim = config.kv_size + n_heads = config.num_heads + + # q, k, v, o parameter + set_data(cell.dense1.weight, Normal(factor * ((d_model * key_value_proj_dim) ** -0.5))) + set_data(cell.dense2.weight, Normal(factor * 
(d_model**-0.5))) + set_data(cell.dense3.weight, Normal(factor * (d_model**-0.5))) + set_data(cell.projection.weight, Normal(factor * ((n_heads * key_value_proj_dim) ** -0.5))) + + if cell.has_relative_bias and cell.is_cross_atten: + set_data(cell.cross_bias, Normal(factor * (d_model**-0.5))) + + +def init_t5_weights(cell, config, prefix=''): + """init t5 weights""" + if hasattr(cell, 'add_name'): + return + + cell.add_flags(add_name=prefix) + init_cell(cell, prefix, config) + + for name, sub_cell in cell.cells_and_names(): + hier_name = prefix + "." + name + init_t5_weights(sub_cell, config, prefix=hier_name) + + +def trans_to_transformer_config(parallel_config): + """trans_to_transformer_config""" + if not parallel_config: + return default_transformer_config + + return TransformerOpParallelConfig(**parallel_config) + + +def create_model(config_path, load_model_path=None, parallel_config=None, from_pretrained=False): + """create model""" + if from_pretrained: + return T5WithLoss.from_pretrained(config_path) + + base_config = AutoConfig.from_pretrained(config_path) + base_config.parallel_config = trans_to_transformer_config(parallel_config) + model = T5WithLoss(base_config) + + if load_model_path: + # load from checkpoint path + param_dict = mindspore.load_checkpoint(load_model_path) + mindspore.load_param_into_net(model, param_dict) + logger.info("pretrain: load ckpt successful") + else: + # init T5 + init_t5_weights(model, base_config, prefix="") + logger.info("pretrain: inited successful") + + return model diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/convert_weight.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/convert_weight.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce778d66c5bd9cdf30463eab33b6f413a480d0d --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/convert_weight.py @@ -0,0 +1,180 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Convert checkpoint from torch/huggingface: source from mindformers""" +import argparse +import numpy as np +import torch +from mindspore import save_checkpoint, Tensor + + +def generate_params_dict(total_layers, + mindspore_params_per_layer, + torch_params_per_layer, + mindspore_additional_params, + torch_additional_params): + """ + Generate the total parameter mapping of mindspore and pytorch. + + Args: + total_layers(int): The total layers of the net. + mindspore_params_per_layer(list): The list of params per layer for the net of mindspore. + torch_params_per_layer(list): The list of params per layer for the net of pytorch. + mindspore_additional_params(list): The list of params outside the layer for the net of mindspore + torch_additional_params(list): The list of params outside the layer for the net of pytorch. + + Returns: + A list of tuple. The first element is the parameter name of mindspore, + the another is the parameter name of pytorch. 
+ """ + mapped_params = list(zip(mindspore_params_per_layer, torch_params_per_layer)) + ms_extend_param_list = [] + torch_extend_param_list = [] + for i in range(total_layers): + for ms_para, torch_para in mapped_params: + src = ms_para.format(i) + tgt = torch_para.format(i) + + ms_extend_param_list.append(src) + torch_extend_param_list.append(tgt) + + mapped_params = list(zip(mindspore_additional_params, torch_additional_params)) + for ms_para, torch_para in mapped_params: + ms_extend_param_list.append(ms_para) + torch_extend_param_list.append(torch_para) + + return list(zip(ms_extend_param_list, torch_extend_param_list)) + +def get_converted_ckpt(mapped_params, weight_dict): + """ + Print the keys of the loaded checkpoint + + Args: + mapped_params(dict): The loaded checkpoint. The key is parameter name and value is the numpy array. + weight_dict(dict): The loaded pytorch checkpoint. + + Returns: + None + """ + new_ckpt_list = [] + # Currently, the ms_extend_param the torch_extend_param is the full parameters. + for src, tgt in mapped_params: + value = weight_dict[tgt].numpy() + is_transpose = "" + if '.o.' in tgt or '.wi.' in tgt or '.wo.' 
in tgt: + value = np.transpose(value, [1, 0]) + is_transpose = " transposed" + print(f"Mapping table Mindspore:{src:<30} \t Torch:{tgt:<30} with shape {value.shape}" + f"---{is_transpose}") + new_ckpt_list.append({"data": Tensor(value), "name": src}) + return new_ckpt_list + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="OPT convert script") + parser.add_argument('--layers', + type=int, + default=1, + help="The number of layers of the model to be converted.") + parser.add_argument("--torch_path", + type=str, + default=None, + required=True, + help="The torch checkpoint path.") + parser.add_argument("--mindspore_path", + type=str, + required=True, + default="The output mindspore checkpoint path.", + help="Use device nums, default is 128.") + + opt = parser.parse_args() + state_dict = torch.load(opt.torch_path, map_location='cpu') + + ms_name = [ + "t5_model.tfm_encoder.blocks.{}.layernorm1.gamma", + "t5_model.tfm_encoder.blocks.{}.layernorm2.gamma", + "t5_model.tfm_encoder.blocks.{}.attention.dense1.weight", + "t5_model.tfm_encoder.blocks.{}.attention.dense2.weight", + "t5_model.tfm_encoder.blocks.{}.attention.dense3.weight", + "t5_model.tfm_encoder.blocks.{}.attention.projection.weight", + "t5_model.tfm_encoder.blocks.{}.output.mapping.weight", + "t5_model.tfm_encoder.blocks.{}.output.projection.weight", + + "t5_model.tfm_decoder.blocks.{}.layernorm1.gamma", + "t5_model.tfm_decoder.blocks.{}.cross_attention_layernorm.gamma", + "t5_model.tfm_decoder.blocks.{}.layernorm2.gamma", + "t5_model.tfm_decoder.blocks.{}.attention.dense1.weight", + "t5_model.tfm_decoder.blocks.{}.attention.dense2.weight", + "t5_model.tfm_decoder.blocks.{}.attention.dense3.weight", + "t5_model.tfm_decoder.blocks.{}.attention.projection.weight", + + "t5_model.tfm_decoder.blocks.{}.cross_attention.dense1.weight", + "t5_model.tfm_decoder.blocks.{}.cross_attention.dense2.weight", + "t5_model.tfm_decoder.blocks.{}.cross_attention.dense3.weight", + 
"t5_model.tfm_decoder.blocks.{}.cross_attention.projection.weight", + "t5_model.tfm_decoder.blocks.{}.output.mapping.weight", + "t5_model.tfm_decoder.blocks.{}.output.projection.weight", + ] + + torch_name = [ + "encoder.block.{}.layer.0.layer_norm.weight", + "encoder.block.{}.layer.1.layer_norm.weight", + "encoder.block.{}.layer.0.SelfAttention.q.weight", + "encoder.block.{}.layer.0.SelfAttention.k.weight", + "encoder.block.{}.layer.0.SelfAttention.v.weight", + "encoder.block.{}.layer.0.SelfAttention.o.weight", + "encoder.block.{}.layer.1.DenseReluDense.wi.weight", + "encoder.block.{}.layer.1.DenseReluDense.wo.weight", + + "decoder.block.{}.layer.0.layer_norm.weight", + "decoder.block.{}.layer.1.layer_norm.weight", + "decoder.block.{}.layer.2.layer_norm.weight", + "decoder.block.{}.layer.0.SelfAttention.q.weight", + "decoder.block.{}.layer.0.SelfAttention.k.weight", + "decoder.block.{}.layer.0.SelfAttention.v.weight", + "decoder.block.{}.layer.0.SelfAttention.o.weight", + + "decoder.block.{}.layer.1.EncDecAttention.q.weight", + "decoder.block.{}.layer.1.EncDecAttention.k.weight", + "decoder.block.{}.layer.1.EncDecAttention.v.weight", + "decoder.block.{}.layer.1.EncDecAttention.o.weight", + + "decoder.block.{}.layer.2.DenseReluDense.wi.weight", + "decoder.block.{}.layer.2.DenseReluDense.wo.weight", + ] + + addition_mindspore = [ + "t5_model.encoder_layernorm.gamma", + "t5_model.decoder_layernorm.gamma", + "t5_model.tfm_embedding_lookup.embedding_table", + "t5_model.tfm_encoder.blocks.0.attention.bias_generator.embeddings_table", + "t5_model.tfm_decoder.blocks.0.attention.bias_generator.embeddings_table", + ] + + addition_torch = [ + "encoder.final_layer_norm.weight", + "decoder.final_layer_norm.weight", + "shared.weight", + "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + ] + + + mapped_param = generate_params_dict(total_layers=opt.layers, + 
mindspore_params_per_layer=ms_name, + torch_params_per_layer=torch_name, + mindspore_additional_params=addition_mindspore, + torch_additional_params=addition_torch) + new_ckpt = get_converted_ckpt(mapped_param, state_dict) + save_checkpoint(new_ckpt, opt.mindspore_path) + print(f"Convert finished, the output is saved to {opt.mindspore_path}") diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py new file mode 100644 index 0000000000000000000000000000000000000000..93e783f90cb26b6426a63c31f2abd0e453b5514d --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/scripts/trans_csv_to_mindrecord.py @@ -0,0 +1,117 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Read the csv files; transform to mindrecord files to train model.""" +import argparse +import os +import csv +import re + +import numpy as np + +from mindspore.mindrecord import FileWriter +from mindformers import T5Tokenizer + + +# mindrecord schema; data format +SCHEMA = {"raw_ids": {"type": "int32", "shape": [-1]}} +MAX_TOKENS_PER_FILE = 400 * 1024 * 1024 // 4 +STEP_PRINT_NUM = 10000 +STEP_SAVE_NUM = 1000 +file_index = 0 + + +def process_df(row, tokenizer): + """process df""" + text = re.sub(r"[UZOB]", "X", row['text']) + tokens = tokenizer(" ".join(text), truncation=True, add_special_tokens=True, max_length=512) + token_ids = tokens['input_ids'] + + sample = { + "raw_ids": np.array(token_ids, dtype=np.int32) + } + return sample, len(token_ids) + + +def get_writer(output_dir): + "writer" + global file_index + file_name = os.path.join(output_dir, f"data_{file_index}.mindrecord") + writer = FileWriter(file_name, shard_num=1, overwrite=True) + writer.add_schema(SCHEMA, "mindrecord_schema") + file_index += 1 + return writer + + +def converse_file(csv_file_path, num_samples, output_dir, tokenizer): + """read CSV file and transform to mindRecord files.""" + data = [] + current_file_size = 0 + with open(csv_file_path, newline='') as csvfile: + index = 0 + writer = get_writer(output_dir) + reader = csv.DictReader(csvfile) + for row in reader: + index += 1 + if 0 < num_samples < index: + break + + if current_file_size > MAX_TOKENS_PER_FILE: + writer.commit() + writer = get_writer(output_dir) + current_file_size = 0 + + sample, token_length = process_df(row, tokenizer) + + # compute current file size + current_file_size += 4 * token_length + data.append(sample) + + if index % STEP_PRINT_NUM == 0: + print(f"Samples {index} Done") + + if index % STEP_SAVE_NUM == 0: + writer.write_raw_data(data) + data = [] + + if data: + writer.write_raw_data(data) + + writer.commit() + + +def run(file_path, 
num_samples, output_dir): + """run""" + tokenizer = T5Tokenizer.from_pretrained(args.t5_config_path) + if os.path.isfile(file_path) and file_path.endswith('csv'): + converse_file(file_path, num_samples, output_dir, tokenizer) + else: + csv_files = [os.path.join(file_path, filename) for filename in os.listdir(file_path) \ + if filename.endswith('.csv')] + for cfile in csv_files: + converse_file(cfile, num_samples, output_dir, tokenizer) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--number_samples", default=-1, type=int, + help="Choose maximum process data sample number.") + parser.add_argument("--data_dir", type=str, required=True, + help="Data path to converse to mindrecords; it can file or dir.") + parser.add_argument("--output_dir", type=str, required=True, help="Data path of output.") + parser.add_argument('--t5_config_path', type=str, required=True, help='model name or t5 config path') + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + run(args.data_dir, args.number_samples, args.output_dir) diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/__init__.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e8806c147efb3ea0fff0dda381d56cb3cdacbfb5 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Module""" diff --git a/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0fb525d739d66714d1325fa24edfb66d5a6d9724 --- /dev/null +++ b/MindSPONGE/src/mindsponge/pipeline/models/prot_t5/utils/utils.py @@ -0,0 +1,34 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""some util functions""" +import os +import datetime +import re + + +def generate_checkpoint_filename(checkpoint_dir, model_info): + """get datatime of now to generate filename.""" + now = datetime.datetime.now() + timestamp = now.strftime('%Y%m%d_%H%M%S') + filename = f'model_{model_info}_{timestamp}.ckpt' + filepath = os.path.join(checkpoint_dir, filename) + return filepath + + +def seqs_tokenizer(sequences, tokenizer, return_tensors=None): + """tokenizer; data preprocess; UZOB is rare which are replaced in ProtT5 model""" + sequences = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences] + tokens = tokenizer(sequences, padding=True, add_special_tokens=True, return_tensors=return_tensors) + return (tokens['input_ids'], tokens["attention_mask"]) diff --git a/MindSPONGE/src/mindsponge/pipeline/pipeline.py b/MindSPONGE/src/mindsponge/pipeline/pipeline.py index 8efdf27525e33715a6b9739fab78ca2a4edc21c6..60a41859b82b8bb4a88188d1174f6cfec8408c65 100644 --- a/MindSPONGE/src/mindsponge/pipeline/pipeline.py +++ b/MindSPONGE/src/mindsponge/pipeline/pipeline.py @@ -35,6 +35,8 @@ from .models import RASP, RASPDataSet, rasp_configuration from .models import Multimer, MultimerDataSet, multimer_configuration from .models import ProteinMpnn, ProteinMpnnDataset, proteinmpnn_configuration from .models import UFold, UFoldDataSet, ufold_configuration +from .models import ProtT5, ProtT5TrainDataSet, prott5pretrain_configuration +from .models import ProtT5DownstreamTasks, ProtT5TaskDataSet, prott5downtask_configuration model_card = { @@ -53,6 +55,9 @@ model_card = { "Proteinmpnn": {"model": ProteinMpnn, "dataset": ProteinMpnnDataset, "config": proteinmpnn_configuration}, "RASP": {"model": RASP, "dataset": RASPDataSet, "config": rasp_configuration}, "UFold": {"model": UFold, "dataset": UFoldDataSet, "config": ufold_configuration}, + "ProtT5": {"model": ProtT5, "dataset": ProtT5TrainDataSet, 
"config": prott5pretrain_configuration}, + "ProtT5Downstream": {"model": ProtT5DownstreamTasks, "dataset": ProtT5TaskDataSet, + "config": prott5downtask_configuration} }