From c54feabaf3b411f0ae9ab9303d78ef096092e94b Mon Sep 17 00:00:00 2001 From: xingwei bao <2122533937@qq.com> Date: Tue, 29 Oct 2024 19:07:17 +0800 Subject: [PATCH] add new model scBERT to mindspore --- scBERT/README_CN.md | 300 +++++++++++++++++++++ scBERT/dataset_finetune.py | 53 ++++ scBERT/dataset_pretrain.py | 47 ++++ scBERT/demo/AnnData.png | Bin 0 -> 44934 bytes scBERT/finetune.py | 245 +++++++++++++++++ scBERT/layers.py | 110 ++++++++ scBERT/performer.py | 425 ++++++++++++++++++++++++++++++ scBERT/predict.py | 129 +++++++++ scBERT/pretrain.py | 303 +++++++++++++++++++++ scBERT/run_distribute_finetune.sh | 9 + scBERT/run_distribute_pretrain.sh | 9 + scBERT/utils.py | 22 ++ 12 files changed, 1652 insertions(+) create mode 100644 scBERT/README_CN.md create mode 100644 scBERT/dataset_finetune.py create mode 100644 scBERT/dataset_pretrain.py create mode 100644 scBERT/demo/AnnData.png create mode 100644 scBERT/finetune.py create mode 100644 scBERT/layers.py create mode 100644 scBERT/performer.py create mode 100644 scBERT/predict.py create mode 100644 scBERT/pretrain.py create mode 100644 scBERT/run_distribute_finetune.sh create mode 100644 scBERT/run_distribute_pretrain.sh create mode 100644 scBERT/utils.py diff --git a/scBERT/README_CN.md b/scBERT/README_CN.md new file mode 100644 index 000000000..0ff47332e --- /dev/null +++ b/scBERT/README_CN.md @@ -0,0 +1,300 @@ +# 目录 + +- [目录](#目录) +- [scBERT描述](#scBERT描述) +- [模型架构](#模型架构) +- [数据集](#数据集) +- [特性](#特性) +- [环境要求](#环境要求) +- [快速入门](#快速入门) +- [脚本说明](#脚本说明) + - [脚本及样例代码](#脚本及样例代码) + - [脚本参数](#脚本参数) + - [训练过程](#训练过程) + - [单卡训练](#单卡训练) + - [分布式训练](#分布式训练) + - [微调过程](#微调过程) + - [微调](#微调) + - [python命令启动](#python命令启动) + - [shell脚本启动](#shell脚本启动) + - [推理过程](#推理过程) + - [用法](#用法) + - [结果](#结果) +- [随机情况说明](#随机情况说明) +- [ModelZoo主页](#modelzoo主页) + +# scBERT描述 + +细胞类型的可靠注释是单细胞RNA测序数据下游分析的前提条件。现有的注释算法通常面临批次效应处理不当、缺乏精心筛选的标记基因列表,或难以利用基因-基因之间潜在相互作用信息等问题。受大规模预训练语言模型的启发,我们提出了一种基于预训练深度神经网络的模型scBERT(单细胞双向编码器表示转换模型),以克服上述挑战。scBERT采用了深度学习领域的预训练和微调的最新范式。在scBERT的第一个阶段,它通过在大量未标记的scRNA-seq数据上进行预训练,获得了对基因-基因相互作用的广泛理解。然后,经过预训练的scBERT可以通过监督微调用于对未见的和特定用户的scRNA-seq数据进行细胞注释任务。更多信息请参考 [https://www.biorxiv.org/content/10.1101/2021.12.05.471261v1](https://www.biorxiv.org/content/10.1101/2021.12.05.471261v1)。 + +# 模型架构 + +本模型主要包含以下组件: + +1. Gene2vec位置编码模块: + - 使用预训练的基因向量进行编码 + - 维度: 200 + +2. Performer核心编码器: + - 多头注意力机制: 10个头 + - 前馈网络层 + - Layer Normalization层 + - Dropout正则化 + +3. 预训练任务: + - 掩码语言建模(MLM) + - 掩码概率: 0.15 + - 替换概率: 0.9 + +4. 下游分类头: + - 全连接层 + - ReLU激活 + - Dropout层 + +# 数据集 + +使用的数据集:[panglao_10000 Zheng68k](https://drive.weixin.qq.com/s?k=AJEAIQdfAAozQt5B8k) + +1. 预训练数据集: +- 名称: Panglao scRNA-seq数据集 +- 格式: H5AD文件 +- 路径: ./data/panglao_10000.h5ad +- 特征数: 16906个基因 +- 数据大小: 99.3MB + +2. 微调数据集: +- 名称: Zheng68k scRNA-seq数据集 +- 格式: H5AD文件 +- 路径: ./data/Zheng68k_prepeocessed.h5ad +- 特征数: 17053个基因 +- 细胞类型数: 11 +- 数据大小: 262MB + +支持的数据集:panglao_10000 Zheng68k 或者与 AnnData 格式相同的数据集 + +- 目录结构如下,由用户定义目录和文件的名称 + +![image](demo/predict-demo.jpg) + +- 如果用户需要自定义数据集,则需要将数据集格式转化为AnnData数据格式。 + +# 特性 + +1. 分布式训练支持 + - 数据并行(Data Parallel) + - 流水线并行(Pipeline Parallel) + +2. 动态学习率 + - 使用ExponentialDecayLR进行学习率调整 + +3. 混合精度训练 + - 支持FP16训练 + +4. 
昇腾硬件适配 + - 支持昇腾910训练推理 + +# 环境要求 + +- 硬件(Ascend) + - 使用Ascend处理器来搭建硬件环境。 +- 框架 + - [MindSpore](https://www.mindspore.cn/install) +- 如需查看详情,请参见如下资源 + - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html) + - [MindSpore Python API](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore.html) + +# 快速入门 + +- 通过官方网站安装Mindspore后,您可以按照如下步骤进行预训练和微调 + +```shell +# 单卡训练 +python pretrain.py \ + --data_path='./data/panglao_10000.h5ad' \ + --epoch=100 \ + --batch_size=4 \ + --learning_rate=1e-4 +``` + +```shell +# 通过shell脚本进行8卡训练 +bash run_distribute_pretrain.sh +``` + +```shell +# 单卡微调 +python finetune.py \ + --data_path='./data/Zheng68k_prepeocessed.h5ad' \ + --model_path='./ckpt/ckpt-0.ckpt' \ + --epoch=100 \ + --batch_size=1 \ + --learning_rate=1e-4 +``` + +```shell +# 多卡微调 +bash run_distribute_finetune.sh +``` + +# 脚本说明 + +## 脚本及样例代码 + +```text + |----README_CN.md + |----ckpt + |----data + |----demo + | |----AnnData.png + |----run_distribute_pretrain.sh + |----run_distribute_finetune.sh + |----pretrain.py + |----finetune.py + |----performer.py + |----dataset_pretrain.py + |----dataset_finetune.py + |----layers.py + |----utils.py +``` + +## 脚本参数 + +train.py中主要的参数如下: + +```text + +--enable_pipeline 是否启用流水线并行,默认为True +--device_id 设备ID +--bin_num 分箱数量,默认值:5 +--gene_num 基因数量,默认值:16906 +--epoch 训练轮数,默认值:100 +--seed 随机种子,默认值:2021 +--batch_size 批次大小,默认值:4 +--learning_rate 学习率,默认值:1e-4 +--valid_every 验证间隔,默认值:1 +--mask_prob 掩码概率,默认值:0.15 +--replace_prob 替换概率,默认值:0.9 +--pos_embed 是否使用Gene2vec编码,默认为True +--data_path 数据路径 +--model_name 模型名称,默认为panglao_pretrain +``` + +## 训练过程 + +### 单卡训练 + +在Ascend设备上,使用python脚本直接开始训练(单卡) + + python命令启动 + + ```shell + # 单卡训练 + python pretrain.py --device_id 0 + ``` + +### 分布式训练 + +在Ascend设备上,使用shell脚本执行分布式训练示例(8卡) + +```shell +# 通过shell脚本进行8卡训练 +bash run_distribute_pretrain.sh +``` + +```log + + 上述shell脚本将在后台运行分布式训练。您可以通过training_log.txt文件查看结果。得到如下损失值: + + ```log + + ... + == Epoch: 1 | Training Loss: 0.029950 | Accuracy: 17.1237% == + == Epoch: 1 | Validation Loss: 1.671589 | Accuracy: 0.0000% == + == Epoch: 2 | Training Loss: 0.022785 | Accuracy: 32.4212% == + == Epoch: 2 | Validation Loss: 1.253894 | Accuracy: 3.1250% == + == Epoch: 3 | Training Loss: 0.017635 | Accuracy: 61.4334% == + == Epoch: 3 | Validation Loss: 0.898995 | Accuracy: 75.6098% == + ... 
+ +``` + +## 微调过程 + +### 微调 + +#### python命令启动 + +```shell + +python finetune.py \ + --model_path='./ckpt/pretrain-99.ckpt' \ + --data_path='./data/Zheng68k_prepeocessed.h5ad' + +``` + +#### shell脚本启动 + +```shell + +bash run_distribute_finetune.sh + +``` + +```text + + == Epoch: 1 | Training Loss: 2.027127 | Accuracy: 28.5007% == + == Epoch: 1 | Validation Loss: 1.894380 | Accuracy: 0.300657 == + == Epoch: 2 | Training Loss: 1.293512 | Accuracy: 54.2020% == + == Epoch: 2 | Validation Loss: 0.852387 | Accuracy: 0.695179 == + == Epoch: 3 | Training Loss: 0.617621 | Accuracy: 78.1191% == + == Epoch: 3 | Validation Loss: 0.685155 | Accuracy: 0.738422 == + == Epoch: 4 | Training Loss: 0.395844 | Accuracy: 86.8700% == + == Epoch: 4 | Validation Loss: 0.698182 | Accuracy: 0.741563 == + == Epoch: 5 | Training Loss: 0.249119 | Accuracy: 92.2498% == + == Epoch: 5 | Validation Loss: 0.716395 | Accuracy: 0.756903 == + == Epoch: 6 | Training Loss: 0.163563 | Accuracy: 95.0767% == + == Epoch: 6 | Validation Loss: 0.801939 | Accuracy: 0.752739 == + +``` + +## 推理过程 + +**推理前需使用finetune.py文件生成的模型检查点文件。** + +### 用法 + +执行完整的推理脚本如下: + +```shell + +python predict.py + +``` + +### 结果 + +推理结果保存在当前路径,通过prediction_log.log中看到最终预测结果。 + +```text + +2024-10-29 15:20:13,565 - INFO - Predictions: ['CD19+ B', 'CD19+ B', 'CD19+ B', 'CD19+ B', 'CD19+ B', 'CD19+ B', 'CD19+ B', 'CD19+ B', 'CD19+ B', 'CD19+ B'] + + +``` +# 随机情况说明 + +在训练中存在以下随机性来源: + +1. 数据集随机切分 +2. 随机初始化模型参数 +3. Dropout随机失活 +4. Performer中的随机投影矩阵 +5. 训练数据的随机打乱 + +为了确保结果可复现,我们: +- 使用固定随机种子(--seed参数) +- 使用确定性计算模式 + +# ModelZoo主页 + +请浏览官网[主页](https://gitee.com/mindspore/models)。 \ No newline at end of file diff --git a/scBERT/dataset_finetune.py b/scBERT/dataset_finetune.py new file mode 100644 index 000000000..618da8afe --- /dev/null +++ b/scBERT/dataset_finetune.py @@ -0,0 +1,53 @@ +import mindspore +from mindspore import Tensor +import numpy as np +import scanpy as sc +from sklearn.model_selection import train_test_split +import pickle as pkl + +# 微调用的带有标签的类型 +class SCDataset: + def __init__(self, data, labels, n_class): + self.data = data + self.labels = labels + self.n_class = n_class + + def __getitem__(self, index): + full_seq = self.data[index].toarray()[0] # 假设输入data是稀疏矩阵格式 + full_seq[full_seq > (self.n_class - 2)] = self.n_class - 2 + full_seq = np.append(full_seq, 0).astype(np.int32) # 添加额外的类别 + label = self.labels[index] + label = np.array(label, dtype=np.int32) + return Tensor(full_seq), Tensor(label) + + def __len__(self): + return self.data.shape[0] + + # MindSpore特定: 转换为MindSpore数据集 + def to_mind_dataset(self, batch_size=32, repeat_size=1): + def generator(): + for i in range(len(self)): + # yield self[i], + data, label = self[i] # 假设 self[i] 返回一个 (data, label) 元组 + yield (data, label) + + # 创建数据集 + types = [mindspore.int32, mindspore.int32] + c_names = ["data", "label"] + ds = mindspore.dataset.GeneratorDataset(generator, column_names=c_names, column_types=types) + ds = ds.batch(batch_size).repeat(repeat_size) + return ds + +def load_data(data_path, n_class, seed, batch_size): + data = sc.read_h5ad(data_path) + label_dict, label = np.unique(np.array(data.obs['celltype']), return_inverse=True) + with open('label_dict', 'wb') as fp: + pkl.dump(label_dict, fp) + with open('label', 'wb') as fp: + pkl.dump(label, fp) + data = data.X + X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.1, random_state=42) + train_dataset = SCDataset(X_train, y_train, n_class).to_mind_dataset(batch_size=batch_size) + val_dataset = SCDataset( 
X_test, y_test, n_class).to_mind_dataset(batch_size=batch_size) + print("load data success, train num is {}, val num is {}".format(len(train_dataset), len(val_dataset))) + return train_dataset, val_dataset \ No newline at end of file diff --git a/scBERT/dataset_pretrain.py b/scBERT/dataset_pretrain.py new file mode 100644 index 000000000..e87aa612c --- /dev/null +++ b/scBERT/dataset_pretrain.py @@ -0,0 +1,47 @@ +import mindspore +from mindspore import Tensor +import numpy as np +import scanpy as sc +from sklearn.model_selection import train_test_split +from mindspore.communication import get_group_size, get_rank + +class SCDataset: + def __init__(self, data, n_class, seq_len): + self.data = data + self.n_class = n_class + self.seq_len = seq_len + + def __getitem__(self, index): + full_seq = self.data[index].toarray()[0] # 假设输入data是稀疏矩阵格式 + full_seq[full_seq > (self.n_class - 2)] = self.n_class - 2 + full_seq = np.append(full_seq, 0).astype(np.int32) # 添加额外的类别 + return Tensor(full_seq[:self.seq_len]) + + def __len__(self): + return self.data.shape[0] + + # MindSpore特定: 转换为MindSpore数据集 + def to_mind_dataset(self, batch_size=32, repeat_size=1, DP=False): + def generator(): + for i in range(len(self)): + yield self[i], + + # 创建数据集 + types = [mindspore.int32,] + if DP: + group_size = get_group_size() + rank_id = get_rank() + ds = mindspore.dataset.GeneratorDataset(self, column_names=["data"], column_types=types, num_shards=group_size, shard_id=rank_id) + else: + ds = mindspore.dataset.GeneratorDataset(self, column_names=["data"], column_types=types) + ds = ds.batch(batch_size).repeat(repeat_size) + return ds + +def load_data(data_path, n_class, seed, batch_size, seq_len, args): + data = sc.read_h5ad(data_path) + data = data.X + data_train, data_val = train_test_split(data, test_size=0.1, random_state=seed) + train_dataset = SCDataset(data_train, n_class, seq_len).to_mind_dataset(batch_size=batch_size, DP=args.enable_dp) + val_dataset = SCDataset(data_val, n_class, seq_len).to_mind_dataset(batch_size=batch_size, DP=args.enable_dp) + print("load data success, train num is {}, val num is {}".format(len(train_dataset), len(val_dataset))) + return train_dataset, val_dataset \ No newline at end of file diff --git a/scBERT/demo/AnnData.png b/scBERT/demo/AnnData.png new file mode 100644 index 0000000000000000000000000000000000000000..e6aa27577a0d28323dcadf6deffab18faf0dbf1c GIT binary patch literal 44934 zcmeFZcUV(-*ESl*aTrBlupxZ}6%hfc5kgS`QE4Jgz<`Q?5Q?}!Zq)IO#M1;^GQUigI^V>Mj^F81B{yFDd@B7zzxvm%_W$(TETKB#7iyQjd90&Lg zz+f0N@s-bb---`xlP|F_N6n|d&q-)R^u zlpRsTs#IsS&XaW6)S^9)ApVvqbjEuF*fGP56B z^yfSnb{Q&2B=lVwybIUUD@5vj3`%wgi$@>2RP1@gSmoqFpNkidU;XPyXyV;aCFahZ z(8)yK$%hT~4P(q8qR$r(;gT(X^{|i6=NMMzdtmkGgt)lMlOmiQo`NvJTf@D*@+W(= z>Pt={dZ0^%%{#iwAz`rkw5*4NU5QeA{?_WtFq1|8HXrl*Sr`m)!_>t}U^PzI7Jo1V zgSmw!!R5efW#=~cfX`mN`t~>Q^y<~cU8$XT1WEbg<=>8ir;w-6=Ycf#198J(hYsaI zPbW{__@58{cMIo+OAb3Y29nIpeEDNeOt^b-qbkiu($pny=4Wkrb>w)6Xjs1GbF0KL*AB1RB4?PsHcQa?XPmV-(KUZ0WqahqdX~nIB2muiheccmM-Xb~AJR5Px=y(*(RPku zOY-M`4%ZUh$M+Ve@pH>~QyCxDC+@n9b+P3H?#yT~nxF?b<`*;jSTEjT*!&`7lp#Vt zKgU!qL{@T04l8E_8@5b=OL6sXudgNA42w|)9Nu(WfFr+I|4OgqCrI!JdEFU4e~OF{ z?2;ZDgb_p`6J8CmuDv1+SFdQ#kl3oF*7qS zSr^}J->*pOjSIgev)Pim??2_!~Q}cG>_#Jwb^TTe#W8zKv!rj3uOQwIUFQm#QpA=!br}rz9m|`N`e+aVS z9AZduuBuBG&)e;SH&!z~2b#ms_P-e6>?f*QQoaFvtCK1=G-CLNM8e7-4XscbyZMDT=_TL$Vj#)1O-{1fW~YwFUmF z3Ku@1yJzt0238iBM(J~_Kt!$Q_9gKqHI=O3?d7;-bZcPH=5^oa;{siEXKDv>zB+%N zOp^d&BbDTSt+e5E7??j6vY-KlN>wLnXwn;kJ5qpBFz{83i=r-MU3U7^&*=i*g=g9c 
z;+UIFt(t3eRTQ{78=_BILGZj+r7+&(U9HS{qHkISQ1K;TDD7Jf>qOLrZS&vo5#{EH z5?L(@DUIq0UXjc-%=R%sseVSeF}eTR;^{f{dR-?^&x1YBLvx&eb|vbN4883Cm`nFK zyGpJ-oqm^G)mw~`>>nUx5+Z7*W$S0%;cZU*;){U=_&77FuSe`Rewi7vqOrE^@uvX; z89SMx_t|7k`C%_D1Im1CYXq{!S>d=NB!vtHHQte@TF7Drtst(La;o>@G^M)UU@V%R z@k@eG@q0LIM26RaKpDkgAchq}2KUv!-e^(CQX_FX_V*j6BU9+TZ)$)?S3psGK8Wz1 zUVQpewIg6p+MIoAVjj&MJPNEL-~_f;c8 z-M361eGB4>7BSJ0PcTmBwfSRE;0!j3;v^0iIv-K~yO19h%O7yO0EuwAyYq*aW|DWR z^fYWv)^@rETT*{ph)S(dAKacST`%-_2g6NR#^bLwUYsL`ms%;K^Pj`=f7aknsri@C zpL6G5cKZMABnsH#%!|U$6b@{AV6Vhx*{DKCft1fcya3x#Vu2l&i6+JxbU-1FEnT(i z@|D5waNreul7n7uoG-hHUzG MBNKA&ew!=*297huFaQ7m literal 0 HcmV?d00001 diff --git a/scBERT/finetune.py b/scBERT/finetune.py new file mode 100644 index 000000000..16820d156 --- /dev/null +++ b/scBERT/finetune.py @@ -0,0 +1,245 @@ +import argparse +import numpy as np +from dataset_finetune import load_data +from performer import PerformerLM +from mindspore.nn import Adam, CrossEntropyLoss +from tqdm import tqdm +from mindspore import ops, save_checkpoint, Tensor +import math +from functools import reduce +import mindspore as ms +from mindspore import value_and_grad, ParallelMode, nn +from mindspore.communication import init +from mindspore import Profiler +import pickle as pkl +from sklearn.metrics import accuracy_score +import os + +# 微调中新的输出层 +class Identity(nn.Cell): + def __init__(self, dropout = 0.1, h_dim = 100, out_dim = 10): + super(Identity, self).__init__() + self.conv1 = nn.Conv2d(1, 1, (1,200), pad_mode='valid', padding=0, has_bias=False) + self.act = nn.ReLU() + self.fc1 = nn.Dense(in_channels=SEQ_LEN, out_channels=512, has_bias=True) + self.act1 = nn.ReLU() + self.dropout1 = nn.Dropout(dropout) + self.fc2 = nn.Dense(in_channels=512, out_channels=h_dim, has_bias=True) + self.act2 = nn.ReLU() + self.dropout2 = nn.Dropout(dropout) + self.fc3 = nn.Dense(in_channels=h_dim, out_channels=out_dim, has_bias=True) + + def construct(self, x): + x = x[:,None,:,:] + x = self.conv1(x) + x = self.act(x) + x = x.view(x.shape[0],-1) + x = self.fc1(x) + x = self.act1(x) + x = self.dropout1(x) + x = self.fc2(x) + x = self.act2(x) + x = self.dropout2(x) + x = self.fc3(x) + return x + +model = None +loss_fn = None + +def cum_loss_and_logits(data, label): + global model, loss_fn, SEQ_LEN + logits = model(data) + loss = loss_fn(logits, label) + return loss, logits + +def build_model(args): + global CLASS, SEQ_LEN, POS_EMBED_USING, model + #load the label stored + with open('label_dict', 'rb') as fp: + label_dict = pkl.load(fp) + model = PerformerLM( + num_tokens = CLASS, + dim = 200, + depth = 6, + max_seq_len = SEQ_LEN, + heads = 10, + ) + args = parse() + # 加载预训练权重 + ckpt_file_name = args.model_path + param_dict = ms.load_checkpoint(ckpt_file_name) + # 将权重加载到模型中 + ms.load_param_into_net(model, param_dict) + # 设置参数是否参与梯度计算 + for param in model.trainable_params(): + param.requires_grad = False + for param in model.norm.trainable_params(): + param.requires_grad = True + for param in model.performer.layers[-2].trainable_params(): + param.requires_grad = True + # 覆盖输出层 + model.to_out = Identity(dropout=0.1, h_dim=128, out_dim=label_dict.shape[0]) + print("build model success.") + count = sum([ item.size for item in model.get_parameters()]) + names = [item.name for item in model.trainable_params()] + print("param count is {}, names: {}, count: {}".format(count, str(names), len(names))) + + if args.enable_pipeline: + model.init_pipeline(0) + model.performer.layers[0].init_pipeline(1) + model.performer.layers[0].attention.init_pipeline(1) + return + 
+def build_optimizer_and_scheduler(model): + global LEARNING_RATE, PAD_TOKEN_ID, loss_fn, optimizer + # optimizer + optimizer = Adam(params=model.trainable_params(), learning_rate=LEARNING_RATE) + # loss + loss_fn = CrossEntropyLoss(weight=None) + print("build optimizer success.") + return optimizer + +def train_one_epoch(train_dataloader, grad_fn, optimizer): + global model + running_loss = 0.0 + cum_acc = 0.0 + model.set_train(True) + for _, (data, label) in enumerate(tqdm(train_dataloader.create_tuple_iterator())): + # forward 推理 + (loss, logits), grads = grad_fn(data, label) + optimizer(grads) + # 累加损失 + running_loss += loss.item() + # 计算精度 + final = ops.softmax(logits) + final = final.argmax(axis=-1) + # 预测数 + pred_num = Tensor([final.shape[-1]], ms.int32) + # 计算正确数 + correct_num = ops.Equal()(final, label).sum(axis=-1) + # 计算累计准确率 + cum_acc += correct_num / pred_num.mean() + del data, label, final + + return running_loss, cum_acc + +# 从 Tensor 对象中提取整数值 +def get_value_from_tensor(tensor_list): + return [tensor.asnumpy()[0] for tensor in tensor_list] + +def eval_one_epoch(val_dataloader): + global loss_fn, model, SEQ_LEN + model.set_train(False) + predictions = [] + truths = [] + running_loss = 0.0 + print("========== 开始验证") + for _, (data,label) in enumerate(tqdm(val_dataloader.create_tuple_iterator())): + logits = model(data) + loss = loss_fn(logits, label) + running_loss += loss.item() + softmax = nn.Softmax(axis=-1) + final_prob = softmax(logits) + final = final_prob.argmax(axis=-1) + predictions.append(final) + truths.append(label) + del data, logits, final + val_loss = running_loss / len(val_dataloader) + # 获取 truths 和 predictions 的实际值 + truths_values = get_value_from_tensor(truths) + predictions_values = get_value_from_tensor(predictions) + # 计算正确率 + correct_count = sum(t == p for t, p in zip(truths_values, predictions_values)) + total_count = len(truths_values) + val_acc = correct_count / total_count if total_count > 0 else 0 + # 计算正确数 + del predictions, truths + return val_loss, val_acc + +def train(optimizer, train_dataloader, val_dataloader): + global EPOCHS,VALIDATE_EVERY, MODEL_NAME, loss_fn + + train_num_step = len(train_dataloader) + grad_fn = value_and_grad(cum_loss_and_logits, grad_position=None, weights=model.trainable_params(), has_aux=True) + for epoch in range(EPOCHS): + running_loss, cum_acc = train_one_epoch(train_dataloader, grad_fn, optimizer) + # log epoch的信息 + epoch_loss = running_loss / train_num_step + epoch_acc = 100 * cum_acc / train_num_step + + # 确保将Tensor转换为Python数值 + epoch_loss_value = epoch_loss.asnumpy().item() if isinstance(epoch_loss, ms.Tensor) else epoch_loss + epoch_acc_value = epoch_acc.asnumpy().item() if isinstance(epoch_acc, ms.Tensor) else epoch_acc + + log_string = f' == Epoch: {epoch} | Training Loss: {epoch_loss_value:.6f} | Accuracy: {epoch_acc_value:6.4f}% ==' + print(log_string) + with open('finetune_result.txt', 'a') as f: + f.write(log_string + '\n') + + # 进行一次验证 + if epoch % VALIDATE_EVERY == 0: + val_loss, val_acc = eval_one_epoch(val_dataloader) + log_string = f' == Epoch: {epoch} | Validation Loss: {val_loss} | Accuracy: {val_acc.item()}% ==' + print(log_string) + with open('finetune_result.txt', 'a') as f: + f.write(log_string + '\n') + + ckpt_dir = "./" + FINETUNE_SAVE_PATH + if not os.path.exists(ckpt_dir): + os.makedirs(ckpt_dir, exist_ok=True) + ckpt_file = f"finetune-{epoch}.ckpt" + ckpt_path = os.path.join(ckpt_dir, ckpt_file) + save_checkpoint(model, ckpt_path) + +def parse(): + parser = argparse.ArgumentParser() + 
parser.add_argument("--enable_pipeline", type=bool, default=False, help='Local process rank.') + parser.add_argument("--device_id", type=int, default=-1, help='Local process rank.') + parser.add_argument("--bin_num", type=int, default=5, help='Number of bins.') + parser.add_argument("--gene_num", type=int, default=16906, help='Number of genes.') + parser.add_argument("--epoch", type=int, default=100, help='Number of epochs.') + parser.add_argument("--seed", type=int, default=2021, help='Random seed.') + parser.add_argument("--batch_size", type=int, default=1, help='Number of batch size.') + parser.add_argument("--learning_rate", type=float, default=1e-4, help='Learning rate.') + parser.add_argument("--grad_acc", type=int, default=60, help='Number of gradient accumulation.') + parser.add_argument("--valid_every", type=int, default=1, help='Number of training epochs between twice validation.') + parser.add_argument("--pos_embed", type=bool, default=True, help='Using Gene2vec encoding or not.') + parser.add_argument("--data_path", type=str, default='./data/Zheng68k_prepeocessed.h5ad', help='Path of data for finetune.') + parser.add_argument("--model_path", type=str, default='./ckpt/ckpt-0.ckpt', help='Path of pretrained model.') + parser.add_argument("--ckpt_dir", type=str, default='./finetune_ckpts/', help='Directory of checkpoint to save.') + parser.add_argument("--model_name", type=str, default='finetune', help='Finetuned model name.') + args = parser.parse_args() + return args + +if __name__ == "__main__": + # 1. 解析命令行参数 + args = parse() + if args.enable_pipeline: + ms.set_context(mode=0, device_target="Ascend") + ms.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, pipeline_stages=2, pipeline_result_broadcast=True) + init() + ms.set_seed(1) + else: + ms.set_context(max_device_memory='29GB') + ms.set_context(mode=0, device_target="Ascend", device_id=0) + # 2. 声明全局变量 + SEED = args.seed + EPOCHS = args.epoch + BATCH_SIZE = args.batch_size + GRADIENT_ACCUMULATION = args.grad_acc + LEARNING_RATE = args.learning_rate + SEQ_LEN = args.gene_num + 1 + VALIDATE_EVERY = args.valid_every + PATIENCE = 10 + UNASSIGN_THRES = 0.0 + CLASS = args.bin_num + 2 + POS_EMBED_USING = args.pos_embed + FINETUNE_SAVE_PATH = args.ckpt_dir + # 3. 加载数据集 + train_dataloader, val_dataloader = load_data(args.data_path, CLASS, SEED, BATCH_SIZE) + # 4. 加载模型 + build_model(args) + # 4. 构建优化器和损失函数 + optimizer = build_optimizer_and_scheduler(model) + # 5. 
开始训练
+    train(optimizer, train_dataloader, val_dataloader)
diff --git a/scBERT/layers.py b/scBERT/layers.py
new file mode 100644
index 000000000..5c8f78f19
--- /dev/null
+++ b/scBERT/layers.py
@@ -0,0 +1,110 @@
+from mindspore.nn import Cell, Embedding, GELU, Dense, Dropout, ReLU, LayerNorm, Softmax
+from mindspore import Tensor, ops
+import numpy as np
+import mindspore as ms
+from utils import default
+
+class Gene2VecPositionalEmbedding(Cell):
+    def __init__(self, max_seq_len=16907):
+        super().__init__()
+        gene2vec_weight = np.load('./data/gene2vec_16906.npy')
+        gene2vec_weight = np.concatenate((gene2vec_weight, np.zeros((1, gene2vec_weight.shape[1]))), axis=0)
+        gene2vec_weight = Tensor(gene2vec_weight, dtype=ms.float32)
+        self.emb = Embedding(vocab_size=max_seq_len, embedding_size=200, embedding_table=gene2vec_weight, dtype=ms.float32)
+        self.emb.embedding_table.requires_grad = False
+
+    def construct(self, x):
+        t = ops.arange(start=0, end=x.shape[1], dtype=ms.int32)
+        return self.emb(t)
+
+class BatchMatricMul(Cell):
+    def __init__(self, transpose_a=False, transpose_b=False):
+        super().__init__()
+        self.matmul = ops.BatchMatMul(transpose_a, transpose_b)
+
+    def construct(self, a, b):
+        return self.matmul(a, b)
+
+class Add(Cell):
+    def __init__(self):
+        super().__init__()
+        self.add = ops.Add()
+
+    def construct(self, a, b):
+        return self.add(a, b)
+
+class SelfAttention(Cell):
+    def __init__(
+        self,
+        dim,
+        heads = 8,
+        dim_head = 64,
+        dropout = 0.,
+    ):
+        super().__init__()
+        assert dim % heads == 0, 'dimension must be divisible by number of heads'
+        self.dim_head = default(dim_head, dim // heads)
+        self.inner_dim = self.dim_head * heads
+
+        self.heads = heads
+        self.reshape = ops.Reshape()
+        # stage 1
+        self.to_q = Dense(in_channels=dim, out_channels=self.inner_dim, dtype=ms.float32, has_bias=False)
+        self.to_k = Dense(in_channels=dim, out_channels=self.inner_dim, dtype=ms.float32, has_bias=False)
+        self.to_v = Dense(in_channels=dim, out_channels=self.inner_dim, dtype=ms.float32, has_bias=False)
+        self.to_out = Dense(in_channels=self.inner_dim, out_channels=dim, dtype=ms.float32, has_bias=False)
+        self.dropout1 = Dropout(p=dropout+0.00000001)
+
+        # stage 2
+        self.matmul = BatchMatricMul(False, True)
+        self.softmax = Softmax(axis=-1)
+        self.mul = BatchMatricMul()
+        self.layer_norm = LayerNorm((dim,))
+        self.w1 = Dense(in_channels=dim, out_channels=dim * 4, dtype=ms.float32)
+        self.act = GELU()
+        self.dropout2 = Dropout(p=dropout+0.000001)
+        self.w2 = Dense(in_channels=dim * 4, out_channels=dim, dtype=ms.float32)
+        self.add1 = Add()
+        self.add2 = Add()
+
+    def construct(self, x):
+        # x: (batch_size, seq_len, dim)
+        b, n, _, h = *x.shape, self.heads
+
+        # [bs, seq, hidden] -> [bs, head, seq, head_dim]
+        q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
+        q = self.reshape(q, (b, h, n, self.dim_head))
+        k = self.reshape(k, (b, h, n, self.dim_head))
+        v = self.reshape(v, (b, h, n, self.dim_head))
+        score = self.matmul(q, k)
+        out = self.mul(self.softmax(score), v)
+        out = self.reshape(out, (b, n, self.inner_dim))
+        attn_out = self.to_out(out)
+        attn_out = self.add1(x, attn_out)
+        # 前馈网络 + 残差
+        x = self.layer_norm(attn_out)
+        x = self.w1(x)
+        x = self.act(x)
+        x = self.dropout1(x)
+        x = self.w2(x)
+        out = self.add2(attn_out, x)
+        return self.dropout2(out)
+
+    def init_pipeline(self):
+        self.to_q.pipeline_stage = 0
+        self.to_k.pipeline_stage = 0
+        self.to_v.pipeline_stage = 0
+        self.dropout1.pipeline_stage = 1
+ self.matmul.pipeline_stage = 1 + self.mul.pipeline_stage = 1 + self.softmax.pipeline_stage=1 + self.to_out.pipeline_stage = 2 + self.add1.pipeline_stage = 2 + self.w1.pipeline_stage=2 + self.act.pipeline_stage=2 + self.dropout1.pipeline_stage=2 + self.w2.pipeline_stage=3 + self.add2.pipeline_stage=3 + self.dropout2.pipeline_stage=3 \ No newline at end of file diff --git a/scBERT/performer.py b/scBERT/performer.py new file mode 100644 index 000000000..955e36b62 --- /dev/null +++ b/scBERT/performer.py @@ -0,0 +1,425 @@ +import math +import numpy as np +import mindspore as ms +import mindspore.common.dtype as mstype +import mindspore.nn as nn +import mindspore.ops.functional as F +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor +from mindspore.common.parameter import Parameter +from mindspore.common.initializer import initializer, Normal +from layers import Gene2VecPositionalEmbedding + +# helpers +def exists(val): + return val is not None +def empty(tensor): + return tensor.numel() == 0 +def default(val, d): + return val if exists(val) else d + +def softmax_kernel(data, projection_matrix, is_query=False, normalize_data=True, eps=1e-4): + """ + data:[Batch,Heads,Seq,Dim_head] + projection_matrix:[m,Dim_head] + + """ + b, h, Seq,Dim_head= data.shape + data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1. + ratio = (projection_matrix.shape[0] ** -0.5) + # W'*X + data_dash = data_normalizer * P.MatMul(transpose_b=True)(P.Reshape()(data,(-1,Dim_head)), projection_matrix) + data_dash = P.Reshape()(data_dash,(b,h,Seq,-1)) + # |X|^2/2 + diag_data = data ** 2 + diag_data = P.ReduceSum(keep_dims=True)(diag_data, -1) + diag_data = (diag_data / 2.0) * (data_normalizer ** 2) + #exp(W'x-|X|^2/2) + if is_query: + data_dash = ratio * ( + P.Exp()(data_dash - diag_data - + P.ReduceMax(keep_dims=True)(data_dash, -1)) + eps) + else: + data_dash = ratio * ( + P.Exp()(data_dash - diag_data - P.ReduceMax()(data_dash)) + eps) + + return data_dash + +def orthogonal_matrix_chunk(cols, qr_uniform_q = False): + unstructured_block = np.random.randn(cols, cols).astype(np.float32) + q, r = np.linalg.qr(unstructured_block, mode='reduced') + # proposed by @Parskatt + # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf + if qr_uniform_q: + d = np.diag(r, 0) + q *= np.sign(d) + # 转mindspore Tensor + q = np.transpose(q) + q = Tensor(q) + return q + +def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling = 0, qr_uniform_q = False): + nb_full_blocks = int(nb_rows / nb_columns) + block_list = [] + for _ in range(nb_full_blocks): + q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, ) + block_list.append(q) + remaining_rows = nb_rows - nb_full_blocks * nb_columns + if remaining_rows > 0: + q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, ) + block_list.append(q[:remaining_rows]) + final_matrix = P.Concat()(tuple(block_list)) + + if scaling == 0: + multiplier = Tensor(np.diag(np.linalg.norm(np.random.randn(nb_rows, nb_columns).astype(np.float32), axis = 1))) + elif scaling == 1: + multiplier = Tensor(np.diag(math.sqrt((float(nb_columns))) * np.ones((nb_rows,)))) + else: + raise ValueError(f'Invalid scaling {scaling}') + + return P.MatMul()(multiplier,final_matrix) + +class Softmax_kernel(nn.Cell): + def __init__(self): + super().__init__() + self.Reshape = P.Reshape() + self.MatMul_b = P.MatMul(transpose_b=True) + self.ReduceSum = P.ReduceSum(keep_dims=True) + self.Exp = P.Exp() + self.ReduceMax_keep = 
P.ReduceMax(keep_dims=True)
+        self.ReduceMax = P.ReduceMax()
+
+    def construct(self, data, projection_matrix, is_query=False, normalize_data=True, eps=1e-4):
+        """
+        data: [Batch, Heads, Seq, Dim_head]
+        projection_matrix: [m, Dim_head]
+        """
+        b, h, Seq, Dim_head = data.shape
+        data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.
+        ratio = (projection_matrix.shape[0] ** -0.5)
+        # W'*X
+        data_dash = data_normalizer * self.MatMul_b(self.Reshape(data, (-1, Dim_head)), projection_matrix)
+        data_dash = self.Reshape(data_dash, (b, h, Seq, -1))
+        # |X|^2/2 (sum of squares over the feature dimension, as in softmax_kernel above)
+        diag_data = data ** 2
+        diag_data = self.ReduceSum(diag_data, -1)
+        diag_data = (diag_data / 2.0) * (data_normalizer ** 2)
+        # exp(W'x - |X|^2/2), stabilized by subtracting the max
+        if is_query:
+            data_dash = ratio * (
+                self.Exp(data_dash - diag_data -
+                         self.ReduceMax_keep(data_dash, -1)) + eps)
+        else:
+            data_dash = ratio * (
+                self.Exp(data_dash - diag_data - self.ReduceMax(data_dash)) + eps)
+
+        return data_dash
+
+class Linear_attention(nn.Cell):
+    def __init__(self):
+        super().__init__()
+        self.ReduceSum = P.ReduceSum(keep_dims=True)
+        self.BatchMatMul_b = P.BatchMatMul(transpose_b=True)
+        self.BatchMatMul_a = P.BatchMatMul(transpose_a=True)
+        self.BatchMatMul = P.BatchMatMul()
+        self.Mul = P.Mul()
+
+    def construct(self, q, k, v):
+        """
+        q, k, v: [B, Sq, H]
+        """
+        # [B, 1, H]
+        k_cumsum = self.ReduceSum(k, -2)
+        # [B, Sq, 1]
+        D_inv = 1. / self.BatchMatMul_b(q, k_cumsum)
+        # [B, H, H]
+        context = self.BatchMatMul_a(k, v)
+        # [B, Sq, H]
+        out = self.BatchMatMul(q, context)
+        # [B, Sq, H] * [B, Sq, 1] -> [B, Sq, H]
+        out = self.Mul(out, D_inv)
+        return out
+
+class Causal_linear_attention(nn.Cell):
+    def __init__(self):
+        super().__init__()
+        self.view_ = P.Reshape()
+        self.CumSum = P.CumSum()
+        self.ReduceSum = P.ReduceSum(keep_dims=True)
+        self.BatchMatMul_b = P.BatchMatMul(transpose_b=True)
+        self.BatchMatMul_a = P.BatchMatMul(transpose_a=True)
+        self.Mul = P.Mul()
+
+    def construct(self, q, k, v):
+        k_cumsum = self.CumSum(k, -2)
+        # [n,]
+        D_inv = 1.
/ self.ReduceSum(q * k_cumsum, -1) + # [n,d,1]*[n,1,e] -> [n,d,e] + context = self.BatchMatMul_b(self.view_(k, k.shape + (1,)), self.view_(v, v.shape + (1,))) + #[n,d,e] -> + context = self.CumSum(context,-3) + # [n,1,d] * [n,d,e] -> [n,1,e] = [n,e] + out = self.BatchMatMul_a(self.view_(q, q.shape + (1,)), context) + out = self.view_(out, v.shape) + out = self.Mul(out, D_inv) + return out + +class LayerNorm(nn.Cell): + """ + Layer Normalization + + Args: + normalized_shape: the corresponding shape of the normalized axes + eps: epsilon, a small number avoiding zero division + + Inputs: + x: input tensor + + Returns: + rescaled_output: Tensor, returned tensor after layernorm + """ + def __init__(self, normalized_shape, eps=1e-5): + super(LayerNorm, self).__init__() + self.gamma = Parameter(initializer('ones', normalized_shape), name="gamma") + self.beta = Parameter(initializer('zeros', normalized_shape), name="beta") + self.mean = P.ReduceMean(keep_dims=True) + self.eps = eps + + def construct(self, x): + mean = self.mean(x, -1) + variance = self.mean(F.square(x - mean), -1) + output = (x - mean) / F.sqrt(variance + self.eps) + rescaled_output = output * self.gamma + self.beta + return rescaled_output + +class FeedForward(nn.Cell): + def __init__(self, dim, + mult = 4, + initializer_range=0.02, + hidden_dropout_prob=0.1, + compute_type=mstype.float32): + super(FeedForward,self).__init__() + self.hidden_size = dim + self.w1 = Mapping(dim,dim*mult,initializer_range,compute_type) + self.w2 = Mapping(dim * mult,dim,initializer_range,compute_type) + self.act = nn.GELU() + self.dropout = nn.Dropout(hidden_dropout_prob) + def construct(self, x): + x = self.w1(x) + x = self.act(x) + x = self.w2(x) + x = self.dropout(x) + return x + +class Mapping(nn.Cell): + """ + A mapping function with a 3d input + Args: + input_size: the size of the last dimension of the input tensor + output_size: the desired size of the last dimension of the output tensor + dtype: the compute datatype + scale: the scale factor for initialization + Inputs: + x: the 3d input + Returns: + output: Tensor, a 3d tensor after projection + """ + def __init__(self, input_size, output_size,initializer_range=0.02, dtype=ms.float32, scale=1.0): + super(Mapping, self).__init__() + self.output_size = output_size + self.input_size = input_size + self.weight = Parameter(initializer(Normal(sigma=initializer_range*scale), [input_size, output_size]),name="Weight") + self.bias = Parameter(initializer("zeros", [output_size,]),name="Bias") + self.dtype = dtype + self.cast = P.Cast() + + def construct(self, x): + out_shape = P.Shape()(x)[:-1] + (self.output_size,) + x = P.Reshape()(x, (-1, self.input_size)) + x = nn.MatMul()(x, self.cast(self.weight, self.dtype)) + self.cast(self.bias, self.dtype) + output = P.Reshape()(x, out_shape) + return output + +class FastAttention(nn.Cell): + def __init__(self, dim_heads, nb_features = None, ortho_scaling = 0, causal = False, qr_uniform_q = False): + super(FastAttention, self).__init__() + nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) + self.dim_heads = dim_heads + self.nb_features = nb_features + self.ortho_scaling = ortho_scaling + ## projection_matrix is buffer + self.projection_matrix = gaussian_orthogonal_random_matrix(nb_rows=self.nb_features, + nb_columns=dim_heads, + scaling=ortho_scaling, + qr_uniform_q=qr_uniform_q) + self.causal = causal + self.attn_fn = Linear_attention() if not self.causal else Causal_linear_attention() + self.softmax_kernel = Softmax_kernel() + def 
construct(self, q, k, v): + q = self.softmax_kernel(data=q, projection_matrix=self.projection_matrix, is_query=True) + k = self.softmax_kernel(data=k, projection_matrix=self.projection_matrix, is_query=False) + out = self.attn_fn(q, k, v) + return out + +class SelfAttention(nn.Cell): + def __init__(self, dim, heads, dim_head, causal=False, nb_features=None, qr_uniform_q = False, dropout = 0.9): + super(SelfAttention,self).__init__() + assert dim % heads == 0, 'dimension must be divisible by number of heads' + self.dim_head = dim_head + self.fast_attention = FastAttention(dim_heads=self.dim_head, nb_features=nb_features, causal=causal, qr_uniform_q=qr_uniform_q) + self.heads = heads + self.to_q = Mapping(dim, dim) + self.to_k = Mapping(dim, dim) + self.to_v = Mapping(dim, dim) + self.to_out = Mapping(dim, dim) + self.dropout = nn.Dropout(dropout) + self.view = P.Reshape() + self.Concat = P.Concat(axis=1) + self.Mul = P.Mul() + self.ExpandDims = P.ExpandDims() + self.Tile = P.Tile() + def construct(self, x): + """ + #b:batch_size + #h:num_heads + #n:seq_len + #d:dim_perhead + """ + b, n, dim, = x.shape + h = self.heads + + q, k, v = self.to_q(x), self.to_k(x), self.to_v(x) + q, k, v = self.view(q, (b,h,n,self.dim_head)), self.view(k, (b,h,n,self.dim_head)), self.view(v, (b,h,n,self.dim_head)) + + out = self.fast_attention(q, k, v) + out = self.view(out, (b,n,h* self.dim_head)) + out = self.to_out(out) + + return self.dropout(out) + +class EmbeddingLookup(nn.Cell): + """ + A embeddings lookup table with a fixed dictionary and size. + + Args: + vocab_size (int): Size of the dictionary of embeddings. + embedding_size (int): The size of each embedding vector. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. 
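+
+    Inputs:
+        input_ids (Tensor): token ids of arbitrary shape, e.g. (batch_size, seq_len).
+
+    Returns:
+        Tensor of shape input_ids.shape + (embedding_size,), the looked-up embeddings.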
+ """ + def __init__(self, + vocab_size, + embedding_size, + use_one_hot_embeddings=False, + initializer_range=0.02): + super(EmbeddingLookup, self).__init__() + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.embedding_table = Parameter(initializer(Normal(sigma=initializer_range), + [vocab_size, embedding_size]), name="embedding_table") + self.expand = P.ExpandDims() + self.shape_flat = (-1,) + self.gather = P.GatherV2() + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.shape = P.Shape() + + def construct(self, input_ids): + """Get a embeddings lookup table with a fixed dictionary and size.""" + input_shape = self.shape(input_ids) + + flat_ids = self.reshape(input_ids, self.shape_flat) + if self.use_one_hot_embeddings: + one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) + output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table) + else: + output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) + + out_shape = input_shape + (self.embedding_size,) + output = self.reshape(output_for_reshape, out_shape) + return output + +class AbsolutePositionalEmbedding(nn.Cell): + def __init__(self, dim, max_seq_len): + super(AbsolutePositionalEmbedding, self).__init__() + self.emb = nn.EmbeddingLookup(max_seq_len, dim) + + def construct(self, x): + batch_size, seq_length = x.shape[0], x.shape[1] + input_position = F.tuple_to_array(F.make_range(seq_length)) + # input_position = P.Tile()(input_position, (batch_size, 1)) + return self.emb(input_position) + + +class Performer_layer(nn.Cell): + def __init__(self,dim, heads, dim_head, causal=False, nb_features=None, qr_uniform_q = False, dropout = 0.9): + super(Performer_layer, self).__init__() + self.SelfAttention = SelfAttention(dim, heads, dim_head, causal, nb_features, qr_uniform_q, dropout) + self.FeedForward = FeedForward(dim=dim) + self.LayerNorm = LayerNorm(dim,) + def construct(self, x): + + x = self.LayerNorm(x) + out = x + self.SelfAttention(x) + out = self.LayerNorm(out) + out = out + self.FeedForward(x) + return out + +class Performer(nn.Cell): + def __init__(self,dim, depth, heads, causal=False, nb_features=None, qr_uniform_q = False, dropout = 0.9): + super(Performer, self).__init__() + assert dim % heads == 0 + dim_head = dim//heads + layers = [] + for _ in range(depth): + layers.append(Performer_layer(dim=dim, heads=heads, + dim_head=dim_head, + causal=causal, + nb_features=nb_features, + qr_uniform_q=qr_uniform_q, + dropout=dropout )) + + self.layers = nn.CellList(layers) + + def construct(self, input_tensor): + prev_output = input_tensor + for layer_module in self.layers: + prev_output = layer_module(prev_output) + return prev_output + +class PerformerLM(nn.Cell): + def __init__(self, num_tokens, max_seq_len, dim, depth, heads, causal = True, + nb_features = None, emb_dropout = 0.9, pf_dropout = 0.9, qr_uniform_q = False): + super(PerformerLM,self).__init__() + self.max_seq_len = max_seq_len + self.dim = dim + self.num_tokens = num_tokens + self.token_emb = EmbeddingLookup(num_tokens, dim) + # self.pos_emb = AbsolutePositionalEmbedding(dim, max_seq_len) + self.pos_emb = Gene2VecPositionalEmbedding() + self.dropout = nn.Dropout(emb_dropout) + self.performer = Performer(dim, depth, heads, causal, nb_features, qr_uniform_q, pf_dropout ) + self.norm = LayerNorm(dim) + 
self.MatMul = P.MatMul(transpose_b=True) + self.Reshape = P.Reshape() + self.to_out = nn.Dense(dim, num_tokens, dtype=ms.float32) + def construct(self, input_ids): + # b, n = input_ids.shape + # assert n <= self.max_seq_len, f'sequence length {n} must be less than the max sequence length {self.max_seq_len}' + # token and positional embeddings + + x = self.token_emb(input_ids) + x += self.pos_emb(x) + x = self.dropout(x) + x = self.performer(x) + # norm and to logits + #[batch,seq,hidden] + x = self.norm(x) + # res = self.MatMul(self.Reshape(x,(-1,self.dim)), self.token_emb.embedding_table) + # return self.Reshape(res, input_ids.shape+(self.num_tokens,)) + # 5. (batch, 16906, 200) -> (batch, 16906, 7) + # 输出层 + x = self.to_out(x) + return x \ No newline at end of file diff --git a/scBERT/predict.py b/scBERT/predict.py new file mode 100644 index 000000000..a3c68d689 --- /dev/null +++ b/scBERT/predict.py @@ -0,0 +1,129 @@ +import argparse +import numpy as np +from dataset_finetune import load_data +from performer import PerformerLM +from mindspore.nn import Adam, CrossEntropyLoss +from tqdm import tqdm +from mindspore import ops, save_checkpoint, Tensor +import math +from functools import reduce +import mindspore as ms +from mindspore import value_and_grad, ParallelMode, nn +from mindspore.communication import init +from mindspore import Profiler +import pickle as pkl +from sklearn.metrics import accuracy_score +import logging +import scanpy as sc + +class Identity(nn.Cell): + def __init__(self, dropout = 0.1, h_dim = 100, out_dim = 10): + super(Identity, self).__init__() + self.conv1 = nn.Conv2d(1, 1, (1,200), pad_mode='valid', padding=0, has_bias=False) + self.act = nn.ReLU() + self.fc1 = nn.Dense(in_channels=SEQ_LEN, out_channels=512, has_bias=True) + self.act1 = nn.ReLU() + self.dropout1 = nn.Dropout(dropout) + self.fc2 = nn.Dense(in_channels=512, out_channels=h_dim, has_bias=True) + self.act2 = nn.ReLU() + self.dropout2 = nn.Dropout(dropout) + self.fc3 = nn.Dense(in_channels=h_dim, out_channels=out_dim, has_bias=True) + + def construct(self, x): + x = x[:,None,:,:] + # [batch, 1, seq_len, 200] + x = self.conv1(x) + # [batch, 1, seq_len, 1] + x = self.act(x) + x = x.view(x.shape[0],-1) + x = self.fc1(x) + x = self.act1(x) + x = self.dropout1(x) + x = self.fc2(x) + x = self.act2(x) + x = self.dropout2(x) + x = self.fc3(x) + return x + +def parse(): + parser = argparse.ArgumentParser() + parser.add_argument("--enable_pipeline", type=bool, default=False, help='Local process rank.') + parser.add_argument("--device_id", type=int, default=-1, help='Local process rank.') + parser.add_argument("--bin_num", type=int, default=5, help='Number of bins.') + parser.add_argument("--gene_num", type=int, default=16906, help='Number of genes.') + parser.add_argument("--epoch", type=int, default=100, help='Number of epochs.') + parser.add_argument("--seed", type=int, default=2021, help='Random seed.') + parser.add_argument("--pos_embed", type=bool, default=True, help='Using Gene2vec encoding or not.') + parser.add_argument("--data_path", type=str, default='./data/Zheng68k_prepeocessed.h5ad', help='Path of data for predict.') + parser.add_argument("--model_path", type=str, default='./ckpt/ckpt-0.ckpt', help='Path of finetuned model.') + args = parser.parse_args() + return args + +if __name__ == "__main__": + # 配置日志记录到文件 'prediction_log.log' + logging.basicConfig( + filename='prediction_log.log', + filemode='a', # 'a' 表示追加模式,如果要覆盖日志文件,可以使用 'w' + format='%(asctime)s - %(levelname)s - %(message)s', + 
level=logging.INFO + ) + + # 解析命令行参数 + args = parse() + if args.enable_pipeline: + ms.set_context(mode=0, device_target="Ascend") + ms.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, pipeline_stages=2, pipeline_result_broadcast=True) + init() + ms.set_seed(1) + else: + ms.set_context(variable_memory_max_size='29GB') + ms.set_context(mode=0, device_target="Ascend", device_id=0) + + # 声明全局变量 + SEED = args.seed + EPOCHS = args.epoch + SEQ_LEN = args.gene_num + 1 + CLASS = args.bin_num + 2 + POS_EMBED_USING = args.pos_embed + + # 读预测数据集 + data = sc.read_h5ad(args.data_path) + # 标签字典 + with open('label_dict', 'rb') as fp: + label_dict = pkl.load(fp) + data = data.X[:10] + + # 加载模型 + model = PerformerLM( + num_tokens = CLASS, + dim = 200, + depth = 6, + max_seq_len = SEQ_LEN, + heads = 10, + # local_attn_heads = 0, + # g2v_position_emb = True + ) + model.to_out = Identity(dropout=0.1, h_dim=128, out_dim=label_dict.shape[0]) + path = args.model_path + ckpt = ms.load_checkpoint(path) + ms.load_param_into_net(model, ckpt) + for param in model.trainable_params(): + param.requires_grad = False + + batch_size = data.shape[0] + model.set_train(False) + pred_finals = [] + for index in range(batch_size): + full_seq = data[index].toarray()[0] + full_seq[full_seq > (CLASS - 2)] = CLASS - 2 + full_seq = np.append(full_seq, 0).astype(np.int32) + full_seq = Tensor(full_seq).astype(ms.int32) # 转换为 MindSpore Tensor + full_seq = ops.expand_dims(full_seq, 0) # 在第 0 维度添加一个维度,类似于 unsqueeze + pred_logits = model(full_seq) + pred_prob = ops.softmax(pred_logits, axis = -1) + pred_final = pred_prob.argmax(axis=-1) + pred_finals.append(pred_final) + pred_list = [] + for pred_final in pred_finals: + pred_list.append(label_dict[pred_final]) + logging.info(f"Predictions: {pred_list}") \ No newline at end of file diff --git a/scBERT/pretrain.py b/scBERT/pretrain.py new file mode 100644 index 000000000..3fca59bc6 --- /dev/null +++ b/scBERT/pretrain.py @@ -0,0 +1,303 @@ +import argparse +from dataset_pretrain import load_data +from performer import PerformerLM +from mindspore.nn import Adam, CrossEntropyLoss +from tqdm import tqdm +from mindspore import ops, save_checkpoint, Tensor +import math +from functools import reduce +import mindspore as ms +from mindspore import value_and_grad +from mindspore.communication import init +from mindspore.communication.management import get_rank +from mindspore import nn +from mindspore.nn import ExponentialDecayLR +import logging + +model = None +loss_fn = None + +def prob_mask_like(t, prob): + return ops.uniform(t.shape, Tensor(0, dtype=ms.float32), Tensor(1, dtype=ms.float32)).float() < prob + +def mask_with_tokens(t, token_ids): + init_no_mask = ops.full_like(t, False, dtype=ms.uint8) + mask = reduce(lambda acc, el: acc | (t == el), token_ids, init_no_mask) + return Tensor(mask, dtype=ms.uint8) + +def get_mask_subset_with_prob(mask, prob): + batch, seq_len = mask.shape + max_masked = math.ceil(prob * seq_len) # num of mask of a single sequence in average + num_tokens = mask.sum(axis=-1, keepdims=True) # num of pure tokens of each sequence except special tokens + mask_excess = ops.cat((ops.zeros(size=(batch), dtype=ms.float32), ops.arange(1, seq_len,dtype=ms.float32).repeat(batch))).reshape(batch,seq_len) + mask_excess = (mask_excess >= (num_tokens * prob).ceil()) # only 15% of pure tokens can be masked + mask_excess = ops.Reshape()(mask_excess, (batch, seq_len)) + mask_excess = mask_excess[:, :max_masked] # get difference between 15% of pure tokens and 15% of 
all tokens + rand = ops.rand((batch, seq_len)).masked_fill(~mask, -1e9) # rand (0-1) as prob, special token use -1e9 + _, sampled_indices = rand.topk(max_masked, dim=-1) # get index of topk prob to mask + sampled_indices = (sampled_indices + 1).masked_fill(mask_excess, 0) # delete difference of mask not pure + new_mask = ops.zeros((batch, seq_len + 1), dtype=ms.uint8) # get (batch, seq_len) shape zero matrix + new_mask = new_mask.scatter(-1, sampled_indices, ops.ones(shape=ops.shape(sampled_indices), dtype=ms.uint8)) # set masks in zero matrix as 1 + new_mask = ops.Cast()(new_mask, ms.uint8) + return new_mask[:, 1:] # the final mask, True is mask + +def data_mask(data, + mask_prob=None, + replace_prob=None, + num_tokens=None, + random_token_prob=None, + mask_token_id=None, + pad_token_id=None, + mask_ignore_token_ids=None +): + global MASK_PROB, REPLACE_PROB, RANDOM_TOKEN_PROB, MASK_TOKEN_ID, PAD_TOKEN_ID, MASK_IGNORE_TOKEN_IDS + replace_prob = REPLACE_PROB + mask_prob= MASK_PROB + random_token_prob = RANDOM_TOKEN_PROB + mask_token_id = MASK_TOKEN_ID + pad_token_id = PAD_TOKEN_ID + mask_ignore_token_ids = MASK_IGNORE_TOKEN_IDS + + mask_ignore_token_ids = set([*mask_ignore_token_ids, pad_token_id]) + # do not mask [pad] tokens, or any other tokens in the tokens designated to be excluded ([cls], [sep]) + # also do not include these special tokens in the tokens chosen at random + no_mask = mask_with_tokens(data, mask_ignore_token_ids) # ignore_token as True, will not be masked later + mask = get_mask_subset_with_prob(~no_mask, mask_prob) # get the True/False mask matrix + # get mask indices + ## mask_indices = torch.nonzero(mask, as_tuple=True) # get the index of mask(nonzero value of mask matrix) + # mask input with mask tokens with probability of `replace_prob` (keep tokens the same with probability 1 - replace_prob) + masked_input = data + # if random token probability > 0 for mlm + if random_token_prob > 0: + assert num_tokens is not None, 'num_tokens keyword must be supplied when instantiating MLM if using random token replacement' + random_token_prob = prob_mask_like(data, random_token_prob) # get the mask matrix of random token replace + random_tokens = ops.randint(0, num_tokens, data.shape) # generate random token matrix with the same shape as input + random_no_mask = mask_with_tokens(random_tokens, mask_ignore_token_ids) # not masked matrix for the random token matrix + random_token_prob &= ~random_no_mask # get the pure mask matrix of random token replace + random_indices = ops.nonzero(random_token_prob, as_tuple=True) # index of random token replace + masked_input[random_indices] = random_tokens[random_indices] # replace some tokens by random token + # [mask] input + replace_prob = prob_mask_like(data, replace_prob) # get the mask matrix of token being masked + masked_input = masked_input.masked_fill(ops.Cast()(mask * replace_prob, ms.bool_), mask_token_id) # get the data has been masked by mask_token + # mask out any tokens to padding tokens that were not originally going to be masked + labels = data.masked_fill(~mask, pad_token_id) # the label of masked tokens + return masked_input, labels + +def build_model(args): + global CLASS, SEQ_LEN, POS_EMBED_USING, model + model = PerformerLM( + num_tokens = CLASS, # 7 + dim = 200, + depth = 1, + max_seq_len = SEQ_LEN, # 16907 + heads = 10, + #local_attn_heads = 0, + ) + print("build model success.") + count = sum([ item.size for item in model.get_parameters()]) + names = [item.name for item in model.trainable_params()] + + print("param 
count is {}, names: {}, count: {}".format(count, str(names), len(names))) + + if args.enable_pipeline: + model.init_pipeline() + model.performer.layers[0].init_pipeline() + model.performer.layers[0].attention.init_pipeline() + return model + + +def build_optimizer_and_scheduler(model): + global LEARNING_RATE, PAD_TOKEN_ID, loss_fn, optimizer + # optimizer + optimizer = Adam(params=model.trainable_params(), learning_rate=LEARNING_RATE) + loss_fn = CrossEntropyLoss(ignore_index = PAD_TOKEN_ID, reduction='mean') + print("build optimizer success.") + return optimizer, loss_fn + +def train_one_epoch(train_dataloader, grad_fn, optimizer, pp_grad_reducer): + global PAD_TOKEN_ID, model, PIPELINE, BATCH_SIZE, SEQ_LEN, DP, lr_schedule + running_loss = 0.0 + cum_acc = 0.0 + model.set_train(True) + correct_num = 0 + val_num = 0 + for index, (data,) in enumerate(tqdm(train_dataloader.create_tuple_iterator())): + data, orig_labels = data_mask(data) + labels = ops.repeat_elements(orig_labels, rep=7, axis=-1) + labels = ops.cast(labels, dtype=ms.float32) + if (labels.shape[0] % BATCH_SIZE) !=0: + continue + labels = ops.reshape(labels, (BATCH_SIZE,SEQ_LEN, 7)) + if PIPELINE: + loss, grads = grad_fn(data, labels) + grads = pp_grad_reducer(grads) + elif DP: + (loss, logits), grads = grad_fn(data, labels) + grads = pp_grad_reducer(grads) + else: + (loss, logits), grads = grad_fn(data, labels) + optimizer(grads) + lr = lr_schedule(index) + optimizer.learning_rate = lr + # 累加损失 + running_loss += loss.item() / (SEQ_LEN*BATCH_SIZE*7) + # 计算精度 + if not PIPELINE: + labels = ops.repeat_elements(orig_labels, rep=7, axis=-1) + labels = ops.reshape(labels, (-1, SEQ_LEN, 7)) + labels = ops.cast(labels, dtype=ms.float32) + final = ops.softmax(logits, axis=-1)[..., 1:-1] # (bs, seq_len, 7) + final = final.argmax(axis=-1) + 1 # # (bs, seq_len) + correct_num += ops.mul(Tensor(orig_labels!=PAD_TOKEN_ID, dtype=ms.uint8), Tensor(final == orig_labels, dtype=ms.uint8)).sum(axis=-1).sum() + val_num += Tensor(orig_labels != PAD_TOKEN_ID, dtype=ms.uint8).sum(axis=-1).sum() + del data, labels, logits, final, orig_labels + + return running_loss, 100 * correct_num / val_num + +def eval_one_epoch(val_dataloader): + global PAD_TOKEN_ID, loss_fn, model, SEQ_LEN + model.set_train(False) + predictions = [] + truths = [] + running_loss = 0.0 + print("========== 开始验证") + correct_num = 0 + val_num = 0 + for _, (data,) in enumerate(tqdm(val_dataloader.create_tuple_iterator())): + data, ori_labels = data_mask(data) + ori_labels = ops.cast(ori_labels, ms.float32) + labels = ops.repeat_elements(ori_labels, rep=7, axis=-1) + labels = ops.reshape(labels, (-1, SEQ_LEN, 7)) + labels = ops.cast(labels, dtype=ms.float32) + logits = model(data) + loss = loss_fn(logits, labels) + running_loss += loss.item() / (SEQ_LEN*BATCH_SIZE*7) + final = ops.softmax(logits, axis=-1)[..., 1:-1] + final = final.argmax(axis=-1) + 1 + correct_num += ops.mul(Tensor(ori_labels!=PAD_TOKEN_ID, dtype=ms.uint8), Tensor(final == ori_labels, dtype=ms.uint8)).sum(axis=-1).sum() + val_num += Tensor(ori_labels != PAD_TOKEN_ID, dtype=ms.uint8).sum(axis=-1).sum() + del data, labels, logits, final, ori_labels + val_loss = running_loss / len(val_dataloader) + val_acc = 100 * correct_num / val_num + del predictions, truths + return val_loss, val_acc + +def train(optimizer, train_dataloader, val_dataloader): + global EPOCHS,VALIDATE_EVERY, MODEL_NAME, loss_fn, PIPELINE, DP + + train_num_step = len(train_dataloader) + if PIPELINE: + grad_fn = value_and_grad(forward_pipeline, 
grad_position=None, weights=optimizer.parameters) + pp_grad_reducer = nn.PipelineGradReducer(optimizer.parameters) + elif DP: + grad_fn = value_and_grad(forward, grad_position=None, weights=model.trainable_params(), has_aux=True) + pp_grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean=True) + else: + grad_fn = value_and_grad(forward, grad_position=None, weights=model.trainable_params(), has_aux=True) + pp_grad_reducer = None + + for epoch in range(EPOCHS): + running_loss, cum_acc = train_one_epoch(train_dataloader, grad_fn, optimizer, pp_grad_reducer) + # log epoch的信息 + epoch_loss = running_loss / train_num_step + logging.info(f' == Epoch: {epoch} | Training Loss: {epoch_loss:.6f} | Accuracy: {cum_acc.item():6.4f}% ==') + + # 进行一次验证 + if epoch % VALIDATE_EVERY == 0: + val_loss, val_acc = eval_one_epoch(val_dataloader) + logging.info(f' == Epoch: {epoch} | Validation Loss: {val_loss} | Accuracy: {val_acc.item()}% ==') + if get_rank() == 0: + # 存模型 + ckpt_dir = "./" + PRETRAIN_PATH + if not os.path.exists(ckpt_dir): + os.makedirs(ckpt_dir, exist_ok=True) + ckpt_file = f"pretrain-{epoch}.ckpt" + ckpt_path = os.path.join(ckpt_dir, ckpt_file) + save_checkpoint(model, ckpt_path) + +def forward_pipeline(data, label): + global net_with_loss + return net_with_loss(data,label) + +def forward(data, label): + global model, loss_fn + logits = model(data) + loss = loss_fn(logits, label) + return loss, logits + +def parse(): + parser = argparse.ArgumentParser() + parser.add_argument("--enable_pipeline", type=bool, default=True, help='Local process rank.') + parser.add_argument("--device_id", type=int, default=-1, help='Local process rank.') + parser.add_argument("--bin_num", type=int, default=5, help='Number of bins.') + parser.add_argument("--gene_num", type=int, default=16906, help='Number of genes.') + parser.add_argument("--epoch", type=int, default=100, help='Number of epochs.') + parser.add_argument("--seed", type=int, default=2021, help='Random seed.') + parser.add_argument("--batch_size", type=int, default=4, help='Number of batch size.') + parser.add_argument("--learning_rate", type=float, default=1e-4, help='Learning rate.') + parser.add_argument("--valid_every", type=int, default=1, help='Number of training epochs between twice validation.') + parser.add_argument("--mask_prob", type=float, default=0.15, help='Probability of masking.') + parser.add_argument("--replace_prob", type=float, default=0.9, help='Probability of replacing with [MASK] token for masking.') + parser.add_argument("--pos_embed", type=bool, default=True, help='Using Gene2vec encoding or not.') + parser.add_argument("--data_path", type=str, default='./data/panglao_10000.h5ad', help='Path of data for pretraining.') + parser.add_argument("--model_name", type=str, default='panglao_pretrain', help='Pretrained model name.') + args = parser.parse_args() + return args + +if __name__ == "__main__": + # 创建日志记录器,文件保存日志,同时控制台也输出日志 + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler("training_log.log", mode='a'), # 保存日志到文件 + logging.StreamHandler() # 同时输出日志到控制台 + ] + ) + + args = parse() + args.enable_pipeline = False + args.enable_dp = True + if args.enable_pipeline: + ms.set_context(mode=0, device_target="Ascend") + ms.reset_auto_parallel_context() + ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, pipeline_stages=4, pipeline_result_broadcast=True) + init() + ms.set_seed(1) + elif args.enable_dp: + 
ms.set_context(mode=0, device_target="Ascend", max_device_memory="29GB") + ms.reset_auto_parallel_context() + ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True) + init() + ms.set_seed(1) + else: + ms.set_context(mode=0, device_target="Ascend", device_id=1) + + SEED = args.seed + EPOCHS = args.epoch + BATCH_SIZE = args.batch_size + LEARNING_RATE = args.learning_rate + SEQ_LEN = 5000 + VALIDATE_EVERY = args.valid_every + CLASS = args.bin_num + 2 + MASK_PROB = args.mask_prob + REPLACE_PROB = args.replace_prob + RANDOM_TOKEN_PROB = 0. + MASK_TOKEN_ID = CLASS - 1 + PIPELINE = args.enable_pipeline + PAD_TOKEN_ID = CLASS - 1 + MASK_IGNORE_TOKEN_IDS = [0] + POS_EMBED_USING = args.pos_embed + MODEL_NAME = args.model_name + DP = args.enable_dp + train_dataloader, val_dataloader = load_data(args.data_path, CLASS, SEED, BATCH_SIZE, SEQ_LEN, args) + model = build_model(args) + + optimizer,loss_fn = build_optimizer_and_scheduler(model) + if args.enable_pipeline: + global net_with_loss + net_with_loss = nn.PipelineCell(nn.WithLossCell(model, loss_fn), micro_size=4) + net_with_loss.set_train() + global lr_schedule + lr_schedule = ExponentialDecayLR(learning_rate=0.1, decay_rate=0.9, decay_steps=100) + train(optimizer, train_dataloader, val_dataloader) \ No newline at end of file diff --git a/scBERT/run_distribute_finetune.sh b/scBERT/run_distribute_finetune.sh new file mode 100644 index 000000000..36e993d62 --- /dev/null +++ b/scBERT/run_distribute_finetune.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run.sh" +echo "==============================================================================================================" + + +msrun --worker_num=8 --local_worker_num=8 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 finetune.py diff --git a/scBERT/run_distribute_pretrain.sh b/scBERT/run_distribute_pretrain.sh new file mode 100644 index 000000000..d941a94aa --- /dev/null +++ b/scBERT/run_distribute_pretrain.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run.sh" +echo "==============================================================================================================" + + +msrun --worker_num=8 --local_worker_num=8 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 pretrain.py diff --git a/scBERT/utils.py b/scBERT/utils.py new file mode 100644 index 000000000..5e864e94a --- /dev/null +++ b/scBERT/utils.py @@ -0,0 +1,22 @@ +from contextlib import contextmanager + +def exists(val): + return val is not None +def empty(tensor): # mindspore的Tensor.size 返回 张量中元素的个数 + return tensor.size == 0 +def default(val, d): + return val if exists(val) else d +@contextmanager +def null_context(): + yield +def cast_tuple(val): + return (val,) if not isinstance(val, tuple) else val +def find_modules(nn_module, module_type): + return [module for module in nn_module.cells() if isinstance(module, module_type)] # mindspore通过Cell.cells() 获取到子cell。 + +def route_args(router, pos_emb, depth): + routed_args = [ dict() for _ in range(depth)] + + for i in range(depth): + routed_args[depth] = ({"pos_emb":pos_emb},{}) + return routed_args \ No newline at end of file -- Gitee