diff --git a/contrib/Overlap-Recovery/README.md b/contrib/Overlap-Recovery/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..df2f000ea4ec1b926217027f6b2bc409863e86f3
--- /dev/null
+++ b/contrib/Overlap-Recovery/README.md
@@ -0,0 +1,303 @@
+# Overlap-Recovery: Overlapped-Text Restoration Reference Design
+
+## 1 Introduction
+
+This sample uses an in-house algorithm to restore overlapped text and is provided for reference. The system runs on Huawei Ascend 310 cards. It implements the upstream task of the overlapped-text recognition task ([Overlap-CRNN](https://gitee.com/ascend/mindxsdk-referenceapps/tree/master/contrib/Overlap-CRNN)): it separates the overlapped text and outputs a mask for each text instance.
+
+### 1.1 Supported Products
+
+The system is verified on the Atlas 300-3010 and also supports the Atlas 200 RC and Atlas 500 hardware platforms. For product photos and hardware specifications, see the *Atlas 300 AI Accelerator Card User Guide (Model 3010)*. Because the hardware platform is an Atlas 800 AI server (model 3010) equipped with Atlas 300 cards, and such a server is normally reached over the network, it is accessed from a client such as a laptop or PC, on which the results are usually displayed.
+
+### 1.2 Supported Versions
+
+To query the version number, run the following command in the Ascend product environment:
+
+```
+npu-smi info
+```
+
+### 1.3 Software Solution Overview
+
+The software solution is a text-restoration system; its subsystem is described in Table 1.1. The overlapped-text restoration subsystem restores the overlapped text and produces a mask for each text instance. The solution adopts a segmentation-based algorithm and introduces an overlap-region-aware module to recover the overlapped text instances. The functions of the individual modules are listed in Table 1.2.
+
+Table 1.1 Subsystems of the solution:
+
+| No. | Subsystem | Description |
+| :--: | :----------------: | :----------------------------------------------------------: |
+| 1 | Overlapped-text restoration subsystem | Produces the masks of the overlapped text instances; the results are then passed to a downstream text-recognition model for character recognition. |
+
+Table 1.2 Modules of the solution:
+
+| No. | Module | Description |
+| :--: | :--------: | :----------------------------------------------------------: |
+| 1 | Image input | Reads the image (JPG/PNG) with the Pillow library. |
+| 2 | Image decoding | Decodes the image with the Pillow library. |
+| 3 | Image resizing | The model takes a fixed-size input, so the input image is resized while preserving its aspect ratio. |
+| 4 | Text restoration | After resizing, the buffered data is fed into the text-restoration model. The in-house algorithm is used for restoration. |
+| 5 | Result visualization | Visualizes the predicted text-instance masks of a single image with the Pillow library. |
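+
+The following minimal sketch shows how these modules fit together at inference time (Pillow read/decode, aspect-preserving resize to the fixed 768×768 input, om-model inference through the MindX SDK, and mask visualization). It is an illustrative outline only: the device ID (0), the file names, and the omission of mean/std normalization are assumptions made here for brevity; the actual entry point is `inference/ominfer.py`.
+
+```python
+import numpy as np
+from PIL import Image
+from mindx.sdk import base
+from mindx.sdk.base import Model, Tensor
+
+base.mx_init()                                    # initialize MindX SDK resources
+model = Model("models/best_iou.om", 0)            # load the om model on device 0 (assumed)
+
+img = Image.open("test.jpg").convert("RGB")       # modules 1-2: read and decode
+ratio = min(768 / img.width, 768 / img.height)    # module 3: resize, keeping the aspect ratio
+resized = img.resize((int(img.width * ratio), int(img.height * ratio)))
+canvas = np.zeros((768, 768, 3), dtype=np.float32)  # pad to the fixed model input size
+canvas[:resized.height, :resized.width] = np.asarray(resized, dtype=np.float32)
+# the real pipeline also normalizes with fixed mean/std, see inference/preprocess_utils.py
+batch = np.ascontiguousarray(canvas.transpose(2, 0, 1)[None])  # HWC -> 1x3x768x768
+
+tensor = Tensor(batch)                            # module 4: text restoration on the device
+tensor.to_device(0)
+masks, _scores = model.infer([tensor])
+masks.to_host()
+
+for idx, mask in enumerate(np.array(masks)[0] > 0.5):  # module 5: one mask per text instance
+    Image.fromarray(mask.astype(np.uint8) * 255).save(f"{idx}.png")
+```
+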
+### 1.4 Code Directory Structure
+
+The sample project is named `Overlap-Recovery`; its root directory is organized as follows:
+
+```text
+├── train      # training code
+├── inference  # inference code
+```
+
+The `Overlap-Recovery/train` directory is organized as follows:
+
+```text
+├── eval.py                    # accuracy evaluation
+├── train.py                   # main training entry
+├── export.py                  # export the ckpt model to ONNX format
+├── __init__.py
+├── src                        # model source code and helper functions
+│   ├── __init__.py
+│   ├── dataset                # dataset loading and preprocessing
+│   │   ├── __init__.py
+│   │   ├── base_dataset.py    # base class of the dataset classes
+│   │   ├── build_dataset.py   # interface for constructing dataset objects
+│   │   ├── data_process.py    # data preprocessing functions
+│   │   ├── real_dataset.py    # dataset class for the test data
+│   │   ├── synth_dataset.py   # dataset class for the training data
+│   │   ├── utils.py           # helpers for dataset construction
+│   ├── deoccluder             # de-occlusion algorithm
+│   │   ├── __init__.py
+│   │   ├── deoccluder_r50.py  # main model structure
+│   │   ├── fpn_neck.py        # FPN module
+│   │   ├── resnet.py          # ResNet-50 backbone
+│   │   ├── utils.py           # helper functions
+│   │   ├── rpn                # kernel initialization
+│   │   │   ├── __init__.py
+│   │   │   ├── kernel_head.py            # kernel initialization functions
+│   │   │   ├── positional_encoding.py    # positional encoding
+│   │   │   ├── semantic_fpn_warpper.py   # semantic FPN wrapper
+│   │   ├── roi                # kernel update
+│   │   │   ├── __init__.py
+│   │   │   ├── custom_kernel_iter_head.py    # kernel iteration
+│   │   │   ├── custom_kernel_update_head.py  # kernel update
+│   │   │   ├── kernel_update_head.py         # base class for kernel update
+│   │   │   ├── kernel_updator.py             # kernel update helpers
+│   │   ├── custom_cells       # algorithm components
+│   │   │   ├── __init__.py
+│   │   │   ├── custom_assigner.py     # label assignment
+│   │   │   ├── custom_blocks.py       # custom blocks
+│   │   │   ├── custom_losses.py       # custom loss functions
+│   │   │   ├── custom_match_cost.py   # custom matching-cost functions
+│   │   │   ├── custom_operations.py   # custom operators
+│   │   │   ├── custom_samplers.py     # custom samplers
+│   ├── model_utils            # training utilities
+│   │   ├── __init__.py
+│   │   ├── device_adapter.py
+│   │   ├── local_adapter.py
+│   │   ├── moxing_adapter.py
+│   │   ├── configs            # configuration
+│   │   │   ├── __init__.py
+│   │   │   ├── config_base.py
+│   │   │   ├── config_model.py
+│   ├── utils                  # convert PyTorch weights to MindSpore weights
+│   │   └── pth2ckpt.py
+├── scripts                    # shell scripts
+│   ├── convert_resnet.sh      # convert the PyTorch ResNet weights to MindSpore
+│   └── train.sh               # training command
+├── resource_utils             # materials required for weight conversion
+│   └── resnet50_dict.json
+```
+
+The `Overlap-Recovery/inference` directory is organized as follows:
+
+```text
+├── eval.py                # accuracy evaluation
+├── eval_utils.py          # helper functions for metric computation
+├── load_ann.py            # test-set loading
+├── load_img_data.py       # image loading
+├── ominfer.py             # single-image inference
+├── preprocess_utils.py    # image preprocessing helpers
+├── README.md
+├── models                 # model files of different formats
+│   ├── best_iou.onnx
+│   ├── best_iou.ckpt
+│   └── best_iou.om
+├── dataset                # test dataset
+│   ├── img
+│   └── annotation.json
+```
+
+### 1.5 Processing Flow
+
+The processing flow is shown below:
+
+![image-20221201214655261](./inference/流程图.png)
+
+### 1.6 Features and Applicable Scenarios
+
+The restoration model in this sample is intended for text in ordinary images and reports the IoU metric of the text regions on the test images.
+
+The model restores text well when the text in the image is clearly visible, neatly laid out, and of moderate character size.
+
+Restoration is less effective when the text is blurred, laid out irregularly, or the characters are small.
+
+## 2 Environment Dependencies
+
+The dependent software and versions are listed below.
+
+The recommended operating system is Ubuntu 18.04 or CentOS 7.6.
+
+Dependencies of the training environment:
+
+| Software | Version |
+| ------------------- | ----------- |
+| MindX SDK | 3.0RC3 |
+| Ascend-CANN-toolkit | 6.0.RC1 |
+| ubuntu | 18.04.1 LTS |
+| python | 3.9.2 |
+| MindSpore | 1.9.0 |
+| opencv-python | 4.6.0.66 |
+| numpy | 1.23.1 |
+| pillow | 9.1.0 |
+| mmcv | 0.2.14 |
+| loguru | 0.2.14 |
+| tqdm | 4.64.1 |
+| imagesize | 1.4.1 |
+| terminaltables | 3.1.10 |
+
+Dependencies of the inference environment:
+
+| Software | Version |
+| ------------------- | ----------- |
+| MindX SDK | 3.0RC3 |
+| Ascend-CANN-toolkit | 6.0.RC1 |
+| ubuntu | 18.04.1 LTS |
+| python | 3.9.2 |
+| cv2 | 4.5.5.64 |
+| numpy | 1.23.1 |
+| pillow | 9.1.0 |
+| mmcv-full | 1.7.0 |
+
+Before running the inference project, set the environment variables:
+
+- Environment variables
+
+```
+. ${sdk_path}/set_env.sh
+. ${ascend_toolkit_path}/set_env.sh
+```
+
+## 3 Model Training
+
+**Step 1** Download the [ResNet-50 pre-trained weights](https://download.pytorch.org/models/resnet50-19c8e357.pth) from the official PyTorch site and convert them to the format supported by MindSpore:
+
+```
+sh train/scripts/convert_resnet.sh PATH-TO-PYTORCH-WEIGHT PATH-TO-MINDSPORE-WEIGHT
+```
+
+**Step 2** Prepare the dataset and adjust the config parameters.
+
+In `train/src/model_utils/config_base.py`, set the `pretrained_r50` parameter to the path of the converted backbone weights, prepare the training data in the same format as the test data, and set parameters such as `synth_data_root` and `real_data_root`. The output path can be set through the `mindrecord_dir` parameter.
+
+**Step 3** After setting up the training environment described in Section 2, start training with:
+
+```
+python train/train.py
+```
+
+**Step 4** Run inference directly with the trained MindSpore model.
+
+Set the `checkpoint_path` parameter in `train/src/model_utils/config_base.py` to the checkpoint path and run:
+
+```
+python train/eval.py
+```
+
+## 4 Model Conversion
+
+Training in Section 3 produces a ckpt model file. Before the project can run, the ckpt file must be converted to an ONNX model with `export.py`, and the ONNX model must then be converted to an om model with ATC in this repository. The `ckpt->onnx` conversion is done in the training environment and the `onnx->om` conversion in the inference environment (both described in Section 2).
+
+An introduction to the model conversion tool (ATC) is available here: [ATC介绍](https://support.huawei.com/enterprise/zh/doc/EDOC1100234054)
+
+The steps are as follows:
+
+1. Place the trained ckpt model file in the `Overlap-Recovery/train/models` folder on the server; the environment is the same as the training environment (the hardware includes a CPU, see Section 2).
+
+2. Enter the `Overlap-Recovery/train` folder, set the `ckpt_file_path` and `file_name` parameters in `export.py` to your own paths, and run the following commands to perform the `ckpt->onnx` conversion:
+
+   ```
+   cd train
+   python export.py
+   ```
+
+3. Move the generated ONNX model to the inference server and place it under `Overlap-Recovery/inference/models`; the environment is the same as the inference environment (the hardware is Ascend 310, see Section 2).
+
+4. On the inference server, run the following commands (set `onnx_model_path` and `output_model_path` to your own paths) to perform the `onnx->om` conversion:
+
+   ```
+   cd inference/models
+   atc --model=[onnx_model_path] --framework=5 --output=[output_model_path] --soc_version=Ascend310 --input_shape="img:1,3,768,768"
+   ```
+
+5. This command generates the model file `[output_model].om` required by the project in the current directory. After execution the terminal prints:
+
+   ```
+   ATC start working now, please wait for a moment.
+   ATC run success, welcome to the next use.
+ ``` + +表示命令执行成功。 + +相关模型的下载链接如下:[models.zip](https://mindx.sdk.obs.cn-north-4.myhuaweicloud.com/mindxsdk-referenceapps%20/contrib/Overlap-Recovery/models.zip)。 +将模型按照提供的文件夹目录放至即可。 + +## 5 模型推理 + +当已有模型的om文件,保存在`Overlap-Recovery/inference/models/`下,推理所需环境如第2节所述。 + +示例步骤如下: + +**步骤1** 将任意一张待预测的图片存到当前目录下(`./Overlap-Recovery/inference`),文件名修改为`test`。 + +**步骤2** 按照第4节模型转换获取om模型,放置在`Overlap-Recovery/inference/models/`路径下。若未自行转换模型,使用的是仓库提供的模型,则无需修改相关文件,否则修改`ominfer.py`中相关配置,将`model_path`对象的路径改成实际的om模型的路径;`img_prefix`和`img_name`对象的路径改成实际的测试图片的路径;`save_path`对象设置成需要保存可视化图像的路径。 + +**步骤3** 在命令行输入 如下命令运行单张图片模型推理: + +``` +cd inference +python ominfer.py +``` + +**步骤4** 运行结束输出`test`文件夹,预测的mask可视化结果保存在`test`文件夹下。 + + + +## 6 测试精度 + +**步骤1** 在`Overlap-Recovery/inference/dataset/`路径下准备相同格式的数据集(已提供测试用的数据集,按照文件目录放至即可:[dataset.zip](https://mindx.sdk.obs.cn-north-4.myhuaweicloud.com/mindxsdk-referenceapps%20/contrib/Overlap-CRNN/dataset.zip)) + +**步骤2** 在命令行输入 如下命令运行精度测试: + +``` +cd inference +python eval.py +``` + +模型在测试集上的精度达标,最终模型的的精度为84.2%,满足精度要求(≥80%)。 + +![image-20221202155839483](./inference/测试结果.png) \ No newline at end of file diff --git a/contrib/Overlap-Recovery/inference/.gitignore b/contrib/Overlap-Recovery/inference/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5bad0a4adced5daa442a0bfd7208f00836d33541 --- /dev/null +++ b/contrib/Overlap-Recovery/inference/.gitignore @@ -0,0 +1,149 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +.idea +.DS_Store +ominfer_testcase.py +eval_test_ckpt.py +eval_utils_in.py +#./test/0.png +#./test/1.png +#test/input.jpg \ No newline at end of file diff --git a/contrib/Overlap-Recovery/inference/dataset/.gitkeep b/contrib/Overlap-Recovery/inference/dataset/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/contrib/Overlap-Recovery/inference/eval.py b/contrib/Overlap-Recovery/inference/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..74c81438d0e1c989db510e65f819e0839e61e950 --- /dev/null +++ b/contrib/Overlap-Recovery/inference/eval.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
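+
+# Accuracy evaluation for the exported om model.
+# The script walks through the samples listed in ANN_FILE_PATH, preprocesses each
+# image with load_img_data, runs the om model through the MindX SDK, keeps the
+# top-scoring instance masks, crops the padded area, rescales the masks back to the
+# original image size, and finally reports the text IoU via evaluate_metric.
+# Adjust DEVICE_ID and the path constants below to match your environment, then run:
+#   cd inference && python eval.py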
+ + +import warnings +from PIL import Image +import numpy as np +from mindx.sdk import base +from mindx.sdk.base import Tensor, Model, Size, log, ImageProcessor, post, BTensor +from eval_utils import evaluate_metric +from load_ann import load_annotations +from load_img_data import load_img_data +warnings.filterwarnings('ignore') + +DEVICE_ID = 1 # 芯片ID +ANN_FILE_PATH = './dataset/annotation.json' # 标签路径 +IMG_PREFIX_PATH = './dataset' # 图片根路径 +SEG_MASK_PREFIX_PATH = './dataset' # mask根路径 +INFER_MODEL_PATH = "models/best_iou.om" # 模型的路径 + + +class OverlapDataset: + + def __init__(self, annotation_file, img_prefix_path, seg_prefix): + self.data_list = load_annotations(annotation_file, img_prefix_path, seg_prefix) + self.img_prefix = img_prefix_path + self.seg_prefix = seg_prefix + self.sample_num = len(self.data_list) + print(f"There are totally {self.sample_num} samples") + + def __len__(self): + return self.sample_num + + def __getitem__(self, item): + data_item = self.data_list[item] + img_name = data_item['filename'] + img_tensor, img_meta = load_img_data(img_name, self.img_prefix) # hwc-chw + img_meta['seg_map_path'] = data_item['seg_map_path'] + return img_tensor, img_meta + + +def prepare_model(model_path, device_id): + base.mx_init() # 全局资源初始化 + model = Model(model_path, device_id) # 创造模型对象 + return model + + +def postprocess(scaled_mask_preds, cls_score): + num_imgs = 1 + segm_results = [] + segm_scores = [] + for img_id in range(num_imgs): + cls_score_per_img = cls_score[img_id] # num_det, 1 + topk_indices = np.argsort(cls_score_per_img.flatten())[::-1][:4] + scores_per_img = cls_score_per_img.flatten()[topk_indices] + mask_indices = topk_indices + masks_per_img = scaled_mask_preds[img_id][mask_indices] # b, num_det, h,w + seg_masks = masks_per_img > 0.5 + seg_result, segm_score = segm2result(seg_masks, scores_per_img) + segm_results.append(seg_result) + segm_scores.append(segm_score) + # bs, num_det, h, w + segm_results = np.stack(segm_results) + # bs, num_det, 1 + segm_scores = np.stack(segm_scores) + return segm_results, segm_scores + + +def segm2result(mask_preds, cls_scores): + segm_result = [] + seg_scores = [] + num_ins = mask_preds.shape[0] # num_dets, h, w + for idx in range(num_ins): + segm_result.append(mask_preds[idx]) + seg_scores.append(cls_scores[idx]) + # here we only have one classes (text) + segm_result = np.stack(segm_result) # num_det, h, w + seg_scores = np.stack(seg_scores) # num_det + return segm_result, seg_scores + + +def evaluate(ann_file, img_prefix, seg_mask_prefix, model_path): + # dataset + dataset = OverlapDataset(ann_file, img_prefix, seg_mask_prefix) + sample_num = dataset.sample_num + dataset = iter(dataset) + + # model + model = prepare_model(model_path, DEVICE_ID) + + # inference + results = [] + img_metas_list = [] + for idx in range(sample_num): + resize_img, img_meta = next(dataset) + print(f'sample {idx}') + + # prepare image + resize_img = np.expand_dims(resize_img, 0) # add batch dim, 1,3,h,w + resize_img = np.ascontiguousarray(resize_img) + image_tensor = Tensor(resize_img) # 推理前需要转换为tensor的List,使用Tensor类来构建。 + image_tensor.to_device(DEVICE_ID) # !!!!!重要,需要转移至device侧,该函数单独执行 + image_tensor_list = [image_tensor] # 推理前需要转换为tensor的List + + # forward + outputs = model.infer(image_tensor_list) + + # preds Tensor to numpy + outputs[0].to_host() + outputs[0] = np.array(outputs[0]) + outputs[1].to_host() + outputs[1] = np.array(outputs[1]) + + pred_masks, pred_scores = outputs[0], outputs[1] # (1, 4, h, w), (1, 4, 1) + pred_masks, pred_scores = 
postprocess(pred_masks, pred_scores) # (1, 4, h, w), (1, 4) + + # remove padding area + resize_shape = img_meta['img_shape'][:2] # h,w + pred_masks = pred_masks[:, :, :resize_shape[0], :resize_shape[1]] + + # rescaled to original size + ori_size = img_meta['ori_shape'][:2] # h,w + pred_masks = pred_masks[0] # removed batch dim + rescaled_masks = [] + for tmp_idx in range(pred_masks.shape[0]): + img = pred_masks[tmp_idx] + pil_image = Image.fromarray(img) + pil_image = pil_image.resize((ori_size[1], ori_size[0])) + resized_img = np.array(pil_image) + rescaled_masks.append(resized_img) + rescaled_masks = np.stack(rescaled_masks) + + rescaled_masks = np.expand_dims(rescaled_masks, 0) + result = (pred_scores, rescaled_masks) + results.append(result) + img_metas_list.append(img_meta) + # evaluate + eval_res = evaluate_metric(results, img_metas_list, score_thresh=0.2, ) + text_iou = np.around(eval_res.get("text_iou", 0), decimals=3) + print("==============================") + print("精度测试结果如下:") + print(f'text_iou: {text_iou * 100}%') + print("==============================") + + +if __name__ == '__main__': + evaluate(ANN_FILE_PATH, IMG_PREFIX_PATH, SEG_MASK_PREFIX_PATH, INFER_MODEL_PATH) \ No newline at end of file diff --git a/contrib/Overlap-Recovery/inference/eval_utils.py b/contrib/Overlap-Recovery/inference/eval_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2ae6dcd8129e1e3197d6db291b1ade23e8e27b7e --- /dev/null +++ b/contrib/Overlap-Recovery/inference/eval_utils.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
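+
+# Helper functions for the accuracy metrics.
+# cal_mask_iou computes |A ∩ B| / |A ∪ B| over two boolean masks; for example, two
+# masks that share 25 pixels and cover 75 pixels in total have an IoU of 1/3.
+# cal_overlap_mask / cal_union_mask build the overlap and union regions of a list of
+# masks, eval_func matches predicted instances to ground-truth masks with an IoU
+# threshold, and evaluate_metric accumulates the intersections and unions over the
+# whole dataset to report the final text_iou.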
+ + +import numpy as np +import cv2 + + +def cal_mask_iou(mask_a, mask_b, check_valid=False): + if check_valid: + assert len(np.unique(mask_a)) <= 2 + assert len(np.unique(mask_b)) <= 2 + a_bool = mask_a.astype(np.bool) + b_bool = mask_b.astype(np.bool) + intersection_area = (a_bool & b_bool).sum() + union_area = (a_bool | b_bool).sum() + if union_area == 0: + return 0 + return intersection_area / union_area + + +def cal_overlap_mask(mask_list): + if len(mask_list) < 2: + return None + mask_list_bool = [x.astype(np.bool) for x in mask_list] + overlap_mask = np.zeros_like(mask_list_bool[0]) + for ii in range(len(mask_list_bool) - 1): + for jj in range(ii + 1, len(mask_list_bool)): + cur_olp = mask_list_bool[ii] & mask_list_bool[jj] + overlap_mask = overlap_mask | cur_olp + return overlap_mask + + +def cal_union_mask(mask_list): + if len(mask_list) < 1: + return None + mask_list_bool = [x.astype(np.bool) for x in mask_list] + union_mask = np.zeros_like(mask_list_bool[0]) + for mask_bool in mask_list_bool: + union_mask = union_mask | mask_bool + return union_mask + + + +def eval_func(box_scores, masks, img_meta, score_thresh=0.2, iou_thresh=0.5): + # prepare gt + gt_masks = [cv2.imread(x, cv2.IMREAD_UNCHANGED) // 255 for x in img_meta['seg_map_path']] + for mask_ in gt_masks: + if len(mask_.shape) > 2: + import ipdb + ipdb.set_trace() + print(gt_masks) + gt_text = cal_union_mask(gt_masks) + gt_overlap = cal_overlap_mask(gt_masks) + # prepare predict of overlap and text area + + # select top 2 prediction + box_scores = box_scores[0] # remove batch dim + scores = box_scores.tolist() + valid_idx = [] + for ins_idx, score in enumerate(box_scores): + if score > score_thresh: + valid_idx.append(ins_idx) + pred_masks = [masks[0][_] for _ in valid_idx] + if len(pred_masks) == 0: + pred_overlap = np.zeros_like(masks[0][0]) + pred_text = np.zeros_like(masks[0][0]) + elif len(pred_masks) == 1: + pred_overlap = np.zeros_like(masks[0][0]) + pred_text = cal_union_mask(pred_masks) + else: + pred_overlap = cal_overlap_mask(pred_masks) + pred_text = cal_union_mask(pred_masks) + + if len(gt_masks) > 1: + # calculate metrics + intersection_text = (pred_text & gt_text).sum() + union_text = (pred_text | gt_text).sum() + intersection_overlap = (pred_overlap & gt_overlap).sum() + union_overlap = (pred_overlap | gt_overlap).sum() + else: + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + + # prepare predict of text instance + # filter out invalid prediction + valid_idx = [] + for ins_idx, score in enumerate(box_scores): + if score > score_thresh: + valid_idx.append(ins_idx) + match_matrix = np.zeros((len(valid_idx), len(gt_masks)), dtype=np.bool) + for ins_idx, tmp_valid_idx in enumerate(valid_idx): + for gt_ins_idx, tmp_gt_mask in enumerate(gt_masks): + if match_matrix[:, gt_ins_idx].sum() > 0: + continue + # calculate IoU + if cal_mask_iou(masks[0][tmp_valid_idx], tmp_gt_mask) > iou_thresh: + match_matrix[ins_idx, gt_ins_idx] = True + break + # calculate instance-wise mIoU + text_ins_miou = 0 + if match_matrix.sum() > 0: + for ins_idx in range(max(match_matrix.shape)): + if ins_idx >= match_matrix.shape[0]: + # miss det + continue + else: + if ins_idx >= match_matrix.shape[1] or match_matrix[ins_idx].sum() == 0: + # wrong det + continue + else: + pred_mask = masks[0][valid_idx[ins_idx]].astype(np.bool) + gt_idx = match_matrix[ins_idx].nonzero()[0][0] + gt_mask = gt_masks[gt_idx].copy() + cur_iou = cal_mask_iou(pred_mask, gt_mask) + text_ins_miou += cur_iou + return 
(intersection_text, union_text, intersection_overlap, union_overlap), \ + text_ins_miou, max(match_matrix.shape) + + +def evaluate_metric(results, + img_metas, + score_thresh=0.2, + iou_thrs=0.5, + ): + + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + text_ins_miou_list = [] + total_ins_num = 0 + for idx, ((box_scores, masks), img_meta) in enumerate(zip(results, img_metas)): + overall_iou_metrics, text_ins_miou, ins_num = eval_func(box_scores, masks, img_meta, score_thresh, iou_thrs) + intersection_text += overall_iou_metrics[0] + union_text += overall_iou_metrics[1] + intersection_overlap += overall_iou_metrics[2] + union_overlap += overall_iou_metrics[3] + text_ins_miou_list.append(text_ins_miou) + total_ins_num += ins_num + + metric_results = dict( + text_iou=intersection_text / union_text, + ) + + return metric_results + diff --git a/contrib/Overlap-Recovery/inference/load_ann.py b/contrib/Overlap-Recovery/inference/load_ann.py new file mode 100644 index 0000000000000000000000000000000000000000..adcd67eae3b319aa16b7a7d94409e1258a153285 --- /dev/null +++ b/contrib/Overlap-Recovery/inference/load_ann.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
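+
+# Loader for the test-set annotation file. annotation.json is expected to contain a
+# list of entries with exactly three fields each; the values below are illustrative only:
+#   [
+#     {
+#       "img_name": "img/0.jpg",
+#       "data_type": "real",
+#       "texts": [
+#         {"bbox": [x, y, w, h], "mask": "img/0_mask_0.png", "label": "hello"},
+#         {"bbox": [x, y, w, h], "mask": "img/0_mask_1.png", "label": "world"}
+#       ]
+#     }
+#   ]
+# Bounding boxes are converted to [x1, y1, x2, y2], mask paths are joined with the
+# seg_prefix directory, and text instances should follow a pre-defined order
+# (e.g. from the top layer to the bottom layer).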
+ + +import json +import os.path as osp +import imagesize + + +def load_annotations(ann_file_path, img_prefix_path, seg_prefix_path): + """Load annotation from Overlap""" + data_result_list = [] + img_dir = img_prefix_path + seg_dir = seg_prefix_path + if osp.isfile(ann_file_path): + with open(ann_file_path, 'r', encoding='utf-8') as f: + info_list = json.load(f) + for info_ in info_list: + assert len(info_) == 3, f"Invalid line: {info_}" + img_name = info_['img_name'] + data_info = dict(img_path=osp.join(img_dir, img_name)) + data_info['data_type'] = info_['data_type'] + data_info['filename'] = img_name + width, height = imagesize.get(data_info.get('img_path', '')) + data_info['width'] = width + data_info['height'] = height + seg_map_path = [] + text_labels = [] + bboxes = [] + # should follow a pre-defined order, e.g., from top layer to bottom + for text_ins in info_['texts']: + x, y, w, h = text_ins['bbox'] + bbox = [x, y, x + w, y + h] + bboxes.append(bbox) + seg_map_path.append(osp.join(seg_dir, text_ins[f"mask"])) + text_labels.append(text_ins['label']) + data_info['bboxes'] = bboxes + data_info['seg_map_path'] = seg_map_path + data_info['text_labels'] = text_labels + data_result_list.append(data_info) + else: + raise NotImplementedError + return data_result_list diff --git a/contrib/Overlap-Recovery/inference/load_img_data.py b/contrib/Overlap-Recovery/inference/load_img_data.py new file mode 100644 index 0000000000000000000000000000000000000000..190095e2da948fc3a350cf7f5f43311708839baf --- /dev/null +++ b/contrib/Overlap-Recovery/inference/load_img_data.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
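+
+# Builds the test-time preprocessing pipeline and exposes load_img_data().
+# For a given file name and directory prefix, the pipeline loads the image, resizes it
+# with the aspect ratio kept so that it fits into 768x768, normalizes it with the
+# mean/std defined below (converting BGR to RGB), pads it to exactly 768x768, and
+# transposes it to CHW. It returns the preprocessed image together with its meta
+# information (ori_shape, img_shape, pad_shape, scale_factor, ...), e.g.:
+#   resize_img, img_meta = load_img_data('test.jpg', './')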
+ + +from preprocess_utils import build_processor + + +img_scale = (768, 768) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale, + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=img_scale), + dict(type='HWCToCHW', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +preprocessor = build_processor(test_pipeline) + + +def load_img_data(img_name_path, img_prefix_path=None): + + img_info = {'filename':img_name_path} + img_data = {'img_prefix':img_prefix_path, 'img_info': img_info} + + resized_img_data = preprocessor(img_data) + resize_img = resized_img_data.get('img', '') + img_metas = resized_img_data.get('img_metas', '') + return resize_img[0], img_metas[0] diff --git a/contrib/Overlap-Recovery/inference/models/.gitkeep b/contrib/Overlap-Recovery/inference/models/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/contrib/Overlap-Recovery/inference/ominfer.py b/contrib/Overlap-Recovery/inference/ominfer.py new file mode 100644 index 0000000000000000000000000000000000000000..b55c852494b0e6cc4781e14a7be110fd9669ca17 --- /dev/null +++ b/contrib/Overlap-Recovery/inference/ominfer.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
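+
+# Single-image inference demo for the om model.
+# The script preprocesses the image IMG_NAME under INFER_IMG_PREFIX, runs the om model
+# given by MODEL_PATH on device DEVICE_ID, filters the predicted instances by score,
+# and writes one binary mask PNG per text instance (plus a copy of the input image)
+# into a sub-folder of SAVE_PATH named after the image, e.g. ./test/0.png, ./test/1.png.
+# Usage:
+#   cd inference && python ominfer.py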
+ + +import os +import shutil +import warnings +import numpy as np +import cv2 +from mindx.sdk import base +from mindx.sdk.base import Tensor, Model, Size, log, ImageProcessor, post, BTensor +from load_img_data import load_img_data +from PIL import Image +warnings.filterwarnings('ignore') + +DEVICE_ID = 1 # 芯片ID +MODEL_PATH = "models/best_iou.om" # 模型的路径 +INFER_IMG_PREFIX = './' +IMG_NAME = 'test.jpg' +SAVE_PATH = './' + + +def om_infer_one(img_name_path, img_prefix=None, vis_dir=None, score_thr=0.4): + + if not os.path.exists(MODEL_PATH): + print("The input model path is empty!!!") + print("plz place the model in ./Overlap-Recovery/inference/models/") + exit() + + base.mx_init() # 全局资源初始化 + model = Model(MODEL_PATH, DEVICE_ID) # 创造模型对象 + + if not os.path.exists(os.path.join(img_prefix, img_name_path)): + print("The input image path is empty!!!") + print("plz place the image in ./Overlap-Recovery/inference/") + exit() + + if cv2.imread(os.path.join(img_prefix, img_name_path)) is None: + print("=============!Error!================") + print("The input image is empty, plz check out!") + print("====================================") + exit() + + resize_img, img_meta = load_img_data(img_name_path, img_prefix) # hwc-chw + ori_filename = img_meta['ori_filename'] + abs_filename = img_meta['filename'] + print(f"ori_filename: {img_meta['ori_filename']}") + print(f"filename: {img_meta['filename']}") + # h,w,c + print(f"ori_shape: {img_meta['ori_shape']} " + f"resize_shape: {img_meta['img_shape']} " + f"padded_shape: {img_meta['pad_shape']}") + resize_img = np.expand_dims(resize_img, 0) # add batch dim, 1,3,h,w + resize_img = np.ascontiguousarray(resize_img) + image_tensor = Tensor(resize_img) # 推理前需要转换为tensor的List,使用Tensor类来构建。 + image_tensor.to_device(DEVICE_ID) # !!!!!重要,需要转移至device侧,该函数单独执行 + image_tensor_list = [image_tensor] # 推理前需要转换为tensor的List + outputs = model.infer(image_tensor_list) + + # preds Tensor to numpy + outputs[0].to_host() + outputs[0] = np.array(outputs[0]) + outputs[1].to_host() + outputs[1] = np.array(outputs[1]) + + pred_masks, pred_scores = outputs[0], outputs[1] # (1, 4, h, w), (1,4) / (1, 4, 1) + pred_masks, pred_scores = postprocess(pred_masks, pred_scores) + print(f"pred_masks_shape: {pred_masks.shape} pred_score_shape: {pred_scores.shape}") + print(f"original pred unique value: {np.unique(pred_masks)}") + + # remove padding area + resize_shape = img_meta['img_shape'][:2] # h, w + pred_masks = pred_masks[:, :, :resize_shape[0], :resize_shape[1]] + + ori_size = img_meta['ori_shape'][:2] # h, w + + # remove batch dim + pred_masks, pred_scores = pred_masks[0], pred_scores[0] # (4, h, w), (4) + + img_id = os.path.basename(ori_filename).split('.')[0] + if vis_dir is not None: + save_dir = os.path.join(vis_dir, img_id) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + shutil.copyfile(abs_filename, os.path.join(save_dir, f"input.{os.path.basename(ori_filename).split('.')[1]}")) + for instance_idx in range(pred_masks.shape[0]): + text_instance = pred_masks[instance_idx] + pred_score = pred_scores[instance_idx] + + if pred_score < score_thr: + continue + + text_instance = text_instance.astype(np.uint8) + area = np.sum(text_instance) + print(f"pred_text_instance: {instance_idx+1} pred_score: {pred_score} " + f"unique value: {np.unique(text_instance)} area: {area}") + + pred_mask = Image.fromarray(text_instance * 255) + pred_mask = pred_mask.resize((ori_size[1], ori_size[0]))# w,h + + if vis_dir is not None: + save_file = os.path.join(save_dir, f'{instance_idx}.png') + 
pred_mask.save(save_file, bit=1) + print(f'pred text mask saving to {save_file}') + + +def postprocess(scaled_mask_preds, cls_score): + num_imgs = 1 + segm_results = [] + segm_scores = [] + for img_id in range(num_imgs): + cls_score_per_img = cls_score[img_id] # num_det, 1 + topk_indices = np.argsort(cls_score_per_img.flatten())[::-1][:4] + scores_per_img = cls_score_per_img.flatten()[topk_indices] + mask_indices = topk_indices + masks_per_img = scaled_mask_preds[img_id][mask_indices] # b, num_det, h,w + seg_masks = masks_per_img > 0.5 + seg_result, segm_score = segm2result(seg_masks, scores_per_img) + segm_results.append(seg_result) + segm_scores.append(segm_score) + # bs, num_det, h, w + segm_results = np.stack(segm_results) + # bs, num_det, 1 + segm_scores = np.stack(segm_scores) + return segm_results, segm_scores + + +def segm2result(mask_preds, cls_scores): + segm_result = [] + seg_scores = [] + num_ins = mask_preds.shape[0] # num_dets, h, w + for idx in range(num_ins): + segm_result.append(mask_preds[idx]) + seg_scores.append(cls_scores[idx]) + # here we only have one classes (text) + segm_result = np.stack(segm_result) # num_det, h, w + seg_scores = np.stack(seg_scores) # num_det + return segm_result, seg_scores + + +if __name__ == '__main__': + om_infer_one(IMG_NAME, INFER_IMG_PREFIX, vis_dir=SAVE_PATH) + diff --git a/contrib/Overlap-Recovery/inference/preprocess_utils.py b/contrib/Overlap-Recovery/inference/preprocess_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..34bf9516e50f554da31615853f0101a1adcffd25 --- /dev/null +++ b/contrib/Overlap-Recovery/inference/preprocess_utils.py @@ -0,0 +1,1008 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# code reference mmcv and mmdet +import collections +import warnings +import os.path as osp + +import numpy as np +import mmcv +from mmcv.utils import Registry, build_from_cfg + +PIPELINES = Registry('pipeline') + + +@PIPELINES.register_module() +class LoadImageFromFile: + """Load an image from file. + + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename"). Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. 
+ """ + + def __init__(self, to_float32=False, color_type='color', + channel_order='bgr', file_client_args=None): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + file_client_args = file_client_args or dict(backend='disk') + self.file_client_args = file_client_args.copy() + self.file_client = None + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + if results['img_prefix'] is not None: + filename = osp.join(results['img_prefix'], + results['img_info']['filename']) + else: + filename = results['img_info']['filename'] + + img_bytes = self.file_client.get(filename) + img = mmcv.imfrombytes( + img_bytes, flag=self.color_type, channel_order=self.channel_order) + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = filename + results['ori_filename'] = results['img_info']['filename'] + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + results['img_fields'] = ['img'] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f"channel_order='{self.channel_order}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class Compose: + """Compose multiple transforms sequentially. + + Args: + transforms (Sequence[dict | callable]): Sequence of transform object or + config dict to be composed. + """ + + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict') + + def __call__(self, data): + """Call function to apply transforms sequentially. + + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + str_ = t.__repr__() + if 'Compose(' in str_: + str_ = str_.replace('\n', '\n ') + format_string += '\n' + format_string += f' {str_}' + format_string += '\n)' + return format_string + + +@PIPELINES.register_module() +class MultiScaleFlipAug: + """Test-time augmentation with multiple scales and flipping. + + An example configuration is as followed: + + .. code-block:: + + img_scale=[(1333, 400), (1333, 800)], + flip=True, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ] + + After MultiScaleFLipAug with above configuration, the results are wrapped + into lists of the same length as followed: + + .. code-block:: + + dict( + img=[...], + img_shape=[...], + scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] + flip=[False, True, False, True] + ... 
+ ) + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple] | None): Images scales for resizing. + scale_factor (float | list[float] | None): Scale factors for resizing. + flip (bool): Whether apply flip augmentation. Default: False. + flip_direction (str | list[str]): Flip augmentation directions, + options are "horizontal", "vertical" and "diagonal". If + flip_direction is a list, multiple flip augmentations will be + applied. It has no effect when flip == False. Default: + "horizontal". + """ + + def __init__(self, transforms, img_scale=None, scale_factor=None, flip=False): + flip_direction = 'horizontal' + self.transforms = Compose(transforms) + assert (img_scale is None) ^ (scale_factor is None), ( + 'Must have but only one variable can be set') + if img_scale is not None: + self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] + self.scale_key = 'scale' + assert mmcv.is_list_of(self.img_scale, tuple) + else: + self.img_scale = scale_factor if isinstance(scale_factor, list) else [scale_factor] + self.scale_key = 'scale_factor' + + self.flip = flip + self.flip_direction = flip_direction if isinstance(flip_direction, list) else [flip_direction] + assert mmcv.is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ['horizontal']: + warnings.warn( + 'flip_direction has no effect when flip is set to False') + if (self.flip + and not any([t['type'] == 'RandomFlip' for t in transforms])): + warnings.warn( + 'flip has no effect when RandomFlip is not in transforms') + + def __call__(self, results): + """Call function to apply test time augment transforms on results. + + Args: + results (dict): Result dict contains the data to transform. + + Returns: + dict[str: list]: The augmented data, where each value is wrapped + into a list. + """ + + aug_data = [] + flip_args = [(False, None)] + if self.flip: + flip_args += [(True, direction) + for direction in self.flip_direction] + for scale in self.img_scale: + for flip, direction in flip_args: + _results = results.copy() + _results[self.scale_key] = scale + _results['flip'] = flip + _results['flip_direction'] = direction + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(transforms={self.transforms}, ' + repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' + repr_str += f'flip_direction={self.flip_direction})' + return repr_str + + +@PIPELINES.register_module() +class Resize: + """Resize images & bbox & mask. + + This transform resizes the input image to some scale. Bboxes and masks are + then resized with the same scale factor. If the input dict contains the key + "scale", then the scale in the input dict is used, otherwise the specified + scale in the init method is used. If the input dict contains the key + "scale_factor" (if MultiScaleFlipAug does not give img_scale but + scale_factor), the actual scale will be computed by image shape and + scale_factor. + + `img_scale` can either be a tuple (single-scale) or a list of tuple + (multi-scale). There are 3 multiscale modes: + + - ``ratio_range is not None``: randomly sample a ratio from the ratio \ + range and multiply it with the image scale. 
+ - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ + sample a scale from the multiscale range. + - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ + sample a scale from multiple scales. + + Args: + img_scale (tuple or list[tuple]): Images scales for resizing. + multiscale_mode (str): Either "range" or "value". + ratio_range (tuple[float]): (min_ratio, max_ratio) + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + override (bool, optional): Whether to override `scale` and + `scale_factor` so as to call resize twice. Default False. If True, + after the first resizing, the existed `scale` and `scale_factor` + will be ignored so the second resizing can be allowed. + This option is a work-around for multiple times of resize in DETR. + Defaults to False. + """ + + def __init__(self, img_scale=None, multiscale_mode='range', ratio_range=None, + keep_ratio=True): + bbox_clip_border = True + backend = 'cv2' + interpolation = 'bilinear' + override = False + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given a scale and a range of image ratio + assert len(self.img_scale) == 1 + else: + # mode 2: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.backend = backend + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + self.interpolation = interpolation + self.override = override + self.bbox_clip_border = bbox_clip_border + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ + 'keep_ratio' keys are added into result dict. 
+ """ + + if 'scale' not in results: + if 'scale_factor' in results: + img_shape = results['img'].shape[:2] + scale_factor = results['scale_factor'] + assert isinstance(scale_factor, float) + results['scale'] = tuple( + [int(x * scale_factor) for x in img_shape][::-1]) + else: + self._random_scale(results) + else: + if not self.override: + assert 'scale_factor' not in results, ( + 'scale and scale_factor cannot be both set.') + else: + results.pop('scale') + if 'scale_factor' in results: + results.pop('scale_factor') + self._random_scale(results) + + self._resize_img(results) + self._resize_bboxes(results) + self._resize_masks(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'multiscale_mode={self.multiscale_mode}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'keep_ratio={self.keep_ratio}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ + where ``img_scale`` is the selected image scale and \ + ``scale_idx`` is the selected index in the given candidates. + """ + + assert mmcv.is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and upper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where \ + ``img_scale`` is sampled scale and None is just a placeholder \ + to be consistent with :func:`random_select`. + """ + + assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. + ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. + + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where \ + ``scale`` is sampled ratio multiplied with ``img_scale`` and \ + None is just a placeholder to be consistent with \ + :func:`random_select`. 
+ """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. + Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into \ + ``results``, which would be used by subsequent pipelines. + """ + + if self.ratio_range is not None: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + for key in results.get('img_fields', ['img']): + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results[key], + results['scale'], + return_scale=True, + interpolation=self.interpolation, + backend=self.backend) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results[key].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results[key], + results['scale'], + return_scale=True, + interpolation=self.interpolation, + backend=self.backend) + results[key] = img + + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img_shape'] = img.shape + # in case that there is no padding + results['pad_shape'] = img.shape + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_bboxes(self, results): + """Resize bounding boxes with ``results['scale_factor']``.""" + for key in results.get('bbox_fields', []): + bboxes = results[key] * results['scale_factor'] + if self.bbox_clip_border: + img_shape = results['img_shape'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + results[key] = bboxes + + def _resize_masks(self, results): + """Resize masks with ``results['scale']``""" + for key in results.get('mask_fields', []): + if results[key] is None: + continue + if self.keep_ratio: + results[key] = results[key].rescale(results['scale']) + else: + results[key] = results[key].resize(results['img_shape'][:2]) + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + else: + gt_seg = mmcv.imresize( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + results[key] = gt_seg + + 
+@PIPELINES.register_module() +class RandomFlip: + """Flip the image & bbox & mask. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + When random flip is enabled, ``flip_ratio``/``direction`` can either be a + float/string or tuple of float/string. There are 3 flip modes: + + - ``flip_ratio`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``flip_ratio`` . + E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + - ``flip_ratio`` is float, ``direction`` is list of string: the image will + be ``direction[i]``ly flipped with probability of + ``flip_ratio/len(direction)``. + E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + - ``flip_ratio`` is list of float, ``direction`` is list of string: + given ``len(flip_ratio) == len(direction)``, the image will + be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. + E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with probability + of 0.3, vertically with probability of 0.5. + + Args: + flip_ratio (float | list[float], optional): The flipping probability. + Default: None. + direction(str | list[str], optional): The flipping direction. Options + are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. + If input is a list, the length must equal ``flip_ratio``. Each + element in ``flip_ratio`` indicates the flip probability of + corresponding direction. + """ + + def __init__(self, flip_ratio=None, direction='horizontal'): + if isinstance(flip_ratio, list): + assert mmcv.is_list_of(flip_ratio, float) + assert 0 <= sum(flip_ratio) <= 1 + elif isinstance(flip_ratio, float): + assert 0 <= flip_ratio <= 1 + elif flip_ratio is None: + pass + else: + raise ValueError('flip_ratios must be None, float, ' + 'or list of float') + self.flip_ratio = flip_ratio + + valid_directions = ['horizontal', 'vertical', 'diagonal'] + if isinstance(direction, str): + assert direction in valid_directions + elif isinstance(direction, list): + assert mmcv.is_list_of(direction, str) + assert set(direction).issubset(set(valid_directions)) + else: + raise ValueError('direction must be either str or list of str') + self.direction = direction + + if isinstance(flip_ratio, list): + assert len(self.flip_ratio) == len(self.direction) + + def __call__(self, results): + """Call function to flip bounding boxes, masks, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction' keys are added \ + into result dict. 
+ """ + + if 'flip' not in results: + if isinstance(self.direction, list): + # None means non-flip + direction_list = self.direction + [None] + else: + # None means non-flip + direction_list = [self.direction, None] + + if isinstance(self.flip_ratio, list): + non_flip_ratio = 1 - sum(self.flip_ratio) + flip_ratio_list = self.flip_ratio + [non_flip_ratio] + else: + non_flip_ratio = 1 - self.flip_ratio + # exclude non-flip + single_ratio = self.flip_ratio / (len(direction_list) - 1) + flip_ratio_list = [single_ratio] * (len(direction_list) - + 1) + [non_flip_ratio] + + cur_dir = np.random.choice(direction_list, p=flip_ratio_list) + + results['flip'] = cur_dir is not None + if 'flip_direction' not in results: + results['flip_direction'] = cur_dir + if results['flip']: + # flip image + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + # flip bboxes + for key in results.get('bbox_fields', []): + results[key] = self.bbox_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip masks + for key in results.get('mask_fields', []): + results[key] = results[key].flip(results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' + + @staticmethod + def bbox_flip(bboxes, img_shape, direction): + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.copy() + if direction == 'horizontal': + w = img_shape[1] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + elif direction == 'vertical': + h = img_shape[0] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + elif direction == 'diagonal': + w = img_shape[1] + h = img_shape[0] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + else: + raise ValueError(f"Invalid flipping direction '{direction}'") + return flipped + + +@PIPELINES.register_module() +class Pad: + """Pad the image & masks & segmentation map. + + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_to_square (bool): Whether to pad the image into a square. + Currently only used for YOLOX. Default: False. + pad_val (dict, optional): A dict for padding value, the default + value is `dict(img=0, masks=0, seg=255)`. 
+ """ + + def __init__(self, size=None, size_divisor=None, pad_to_square=False, pad_val=None): + self.size = size + self.size_divisor = size_divisor + pad_val = pad_val or dict(img=0, masks=0, seg=255) + if isinstance(pad_val, float) or isinstance(pad_val, int): + warnings.warn( + 'pad_val of float type is deprecated now, ' + f'please use pad_val=dict(img={pad_val}, ' + f'masks={pad_val}, seg=255) instead.', DeprecationWarning) + pad_val = dict(img=pad_val, masks=pad_val, seg=255) + assert isinstance(pad_val, dict) + self.pad_val = pad_val + self.pad_to_square = pad_to_square + + if pad_to_square: + assert size is None and size_divisor is None, \ + 'The size and size_divisor must be None ' \ + 'when pad2square is True' + else: + assert size is not None or size_divisor is not None, \ + 'only one of size and size_divisor should be valid' + assert size is None or size_divisor is None + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. + """ + self._pad_img(results) + self._pad_masks(results) + self._pad_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_to_square={self.pad_to_square}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + pad_val = self.pad_val.get('img', 0) + for key in results.get('img_fields', ['img']): + if self.pad_to_square: + max_size = max(results[key].shape[:2]) + self.size = (max_size, max_size) + if self.size is not None: + padded_img = mmcv.impad( + results[key], shape=self.size, pad_val=pad_val) + elif self.size_divisor is not None: + padded_img = mmcv.impad_to_multiple( + results[key], self.size_divisor, pad_val=pad_val) + results[key] = padded_img + results['pad_shape'] = padded_img.shape + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def _pad_masks(self, results): + """Pad masks according to ``results['pad_shape']``.""" + pad_shape = results['pad_shape'][:2] + pad_val = self.pad_val.get('masks', 0) + for key in results.get('mask_fields', []): + results[key] = results[key].pad(pad_shape, pad_val=pad_val) + + def _pad_seg(self, results): + """Pad semantic segmentation map according to + ``results['pad_shape']``.""" + pad_val = self.pad_val.get('seg', 255) + for key in results.get('seg_fields', []): + results[key] = mmcv.impad( + results[key], shape=results['pad_shape'][:2], pad_val=pad_val) + + +@PIPELINES.register_module() +class Normalize: + """Normalize the image. + + Added key is "img_norm_cfg". + + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + def __call__(self, results): + """Call function to normalize images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. 
+ """ + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imnormalize(results[key], self.mean, self.std, + self.to_rgb) + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' + return repr_str + + +@PIPELINES.register_module() +class ImageToTensor: + """Convert image to :obj:`Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). + + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`Tensor` and transposed to (C, H, W) order. + """ + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = img.transpose(2, 0, 1) # HWC-> CHW + img = np.ascontiguousarray(img) + img = to_tensor(img) + results[key] = img + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PIPELINES.register_module() +class HWCToCHW: + """Convert image to :obj:`Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). + + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`Tensor` and transposed to (C, H, W) order. + """ + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = img.transpose(2, 0, 1) # HWC-> CHW + img = np.ascontiguousarray(img) + results[key] = img + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +def to_tensor(data): + """Convert objects of various python types to :obj:`Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + # return Tensor(data) mindspore Tensor + raise NotImplementedError + + +@PIPELINES.register_module() +class Collect: + """Collect data from the loader relevant to the specific task. + + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - "img_shape": shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. 
+ + - "scale_factor": a float indicating the preprocessing scale + + - "flip": a boolean indicating if image flip transform was used + + - "filename": path to the image file + + - "ori_shape": original shape of the image as a tuple (h, w, c) + + - "pad_shape": image shape after padding + + - "img_norm_cfg": a dict of normalization information: + + - mean - per channel mean subtraction + - std - per channel std divisor + - to_rgb - bool indicating if bgr was converted to rgb + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'img_norm_cfg')`` + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg')): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:mmcv.DataContainer. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + + - keys in``self.keys`` + - ``img_metas`` + """ + + data = {} + img_meta = {} + for key in self.meta_keys: + img_meta[key] = results[key] + data['img_metas'] = img_meta + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + +def build_processor(test_pipelines): + return Compose(test_pipelines) diff --git a/contrib/Overlap-Recovery/inference/test.jpg b/contrib/Overlap-Recovery/inference/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..01251aade0065872fcd8d7c374301db18b176846 Binary files /dev/null and b/contrib/Overlap-Recovery/inference/test.jpg differ diff --git a/contrib/Overlap-Recovery/inference/test/0.png b/contrib/Overlap-Recovery/inference/test/0.png new file mode 100644 index 0000000000000000000000000000000000000000..2de2d1e7760222f308b326d9f1c3cf49648125db Binary files /dev/null and b/contrib/Overlap-Recovery/inference/test/0.png differ diff --git a/contrib/Overlap-Recovery/inference/test/1.png b/contrib/Overlap-Recovery/inference/test/1.png new file mode 100644 index 0000000000000000000000000000000000000000..d4b178a02bb51bc48a3ed7fee7c1a461ead13745 Binary files /dev/null and b/contrib/Overlap-Recovery/inference/test/1.png differ diff --git a/contrib/Overlap-Recovery/inference/test/input.jpg b/contrib/Overlap-Recovery/inference/test/input.jpg new file mode 100644 index 0000000000000000000000000000000000000000..01251aade0065872fcd8d7c374301db18b176846 Binary files /dev/null and b/contrib/Overlap-Recovery/inference/test/input.jpg differ diff --git "a/contrib/Overlap-Recovery/inference/\346\265\201\347\250\213\345\233\276.png" "b/contrib/Overlap-Recovery/inference/\346\265\201\347\250\213\345\233\276.png" new file mode 100644 index 0000000000000000000000000000000000000000..b624fb4a3f6f51fd6ac2fd9633811d5e25eca5cd Binary files /dev/null and "b/contrib/Overlap-Recovery/inference/\346\265\201\347\250\213\345\233\276.png" differ diff --git "a/contrib/Overlap-Recovery/inference/\346\265\213\350\257\225\347\273\223\346\236\234.png" 
"b/contrib/Overlap-Recovery/inference/\346\265\213\350\257\225\347\273\223\346\236\234.png" new file mode 100644 index 0000000000000000000000000000000000000000..b059731bb125417eb0f75d99f8f44e659dd30c4c Binary files /dev/null and "b/contrib/Overlap-Recovery/inference/\346\265\213\350\257\225\347\273\223\346\236\234.png" differ diff --git a/contrib/Overlap-Recovery/train/.gitignore b/contrib/Overlap-Recovery/train/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6f57c0b0be61668fa3c5afa98a1ae7ce6482c28c --- /dev/null +++ b/contrib/Overlap-Recovery/train/.gitignore @@ -0,0 +1,144 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +.idea +.DS_Store +ominfer_testcase.py \ No newline at end of file diff --git a/contrib/Overlap-Recovery/train/.gitkeep b/contrib/Overlap-Recovery/train/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/contrib/Overlap-Recovery/train/__init__.py b/contrib/Overlap-Recovery/train/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4f96c15807d6ba6a00c589cc2181e8677d1c44dd --- /dev/null +++ b/contrib/Overlap-Recovery/train/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/contrib/Overlap-Recovery/train/eval.py b/contrib/Overlap-Recovery/train/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..40c99e553f62ced51087fd2a22f45ed522fa1e81 --- /dev/null +++ b/contrib/Overlap-Recovery/train/eval.py @@ -0,0 +1,97 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Evaluation for De-Occluder""" +import os +import time +import numpy as np +from loguru import logger +from tqdm import tqdm + +from src.model_utils.configs.config_base import config +from src.model_utils.device_adapter import get_device_id +from src.deoccluder import CustomKNet +from src.dataset import build_dataset + +import mindspore as ms +from mindspore import context +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.common import set_seed +from mindspore import dataset as de + + +set_seed(1) +config.train = False + + +def eval_func(eval_set, ckpt_path, src_eval_set): + """MaskRcnn evaluation.""" + net = CustomKNet(config.model) + param_dict = load_checkpoint(ckpt_path) + load_param_into_net(net, param_dict, strict_load=False) + non_match_keys = [] + matched_keys = [] + for _, param in net.parameters_and_names(): + if param.name in param_dict: + matched_keys.append(param.name) + else: + non_match_keys.append(param.name) + net.set_train(False) + + eval_iter = 0 + total = eval_set.get_dataset_size() + + logger.info("\n========================================\n") + logger.info("total images num: ", total) + logger.info("Processing, please wait a moment.") + results = [] + for data in tqdm(eval_set.create_dict_iterator(output_numpy=True, num_epochs=1), total=total): + eval_iter = eval_iter + 1 + for key in data.keys(): + data[key] = ms.Tensor(data[key]) + # run net + output = net(**data) + results.append(output[0]) + logger.info(src_eval_set.evaluate(results)) + + +def eval_(): + device_target = config.device_target + context.set_context(mode=context.PYNATIVE_MODE, device_target=device_target, device_id=get_device_id()) + + logger.info("Start create eval dataset!") + + # It will generate mindrecord file in config.mindrecord_dir + if not os.path.exists(config.mindrecord_dir): + os.makedirs(config.mindrecord_dir) + logger.add(os.path.join(config.mindrecord_dir, time.asctime(time.localtime()).replace(' ', '_') + ".log")) + + # prepare dataset + eval_set_cls = build_dataset(config.data['test']) + collect_pipe = config.data['test']['pipeline'][-1] + column_names = list(collect_pipe['keys']) + list(collect_pipe['meta_keys']) + eval_set = de.GeneratorDataset(eval_set_cls, + column_names=column_names, + num_parallel_workers=1, + shuffle=False) + eval_set = eval_set.batch(1, drop_remainder=False) + + logger.info("Start Eval!") + logger.info(f"ckpt_path = {config.checkpoint_path}") + eval_func(eval_set, config.checkpoint_path, eval_set_cls) + + +if __name__ == '__main__': + eval_() diff --git a/contrib/Overlap-Recovery/train/export.py b/contrib/Overlap-Recovery/train/export.py new file mode 100644 index 0000000000000000000000000000000000000000..7dadeade69df6a63098ef21d591804211fbd2ad7 --- /dev/null +++ b/contrib/Overlap-Recovery/train/export.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +""" export model to 'AIR', 'ONNX' and 'MINDIR' """ + +import numpy as np +import mindspore as ms +from mindspore import Tensor, context, load_checkpoint, export, load_param_into_net +from src.deoccluder import CustomKNet +from src.model_utils.configs.config_base import config +from src.model_utils.device_adapter import get_device_id + +context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU", device_id= get_device_id()) + + +def best_model_export(): + ckpt_file_path = './models/best_iou.ckpt' + file_name = 'best_iou.onnx' + config.data['samples_per_gpu'] = 1 + + net = CustomKNet(config.model) + load_checkpoint(ckpt_file_path, net=net) + net.set_train(False) + net.is_model_export = True + + input_data = Tensor(np.zeros([1, 3, 768, 768]), ms.float32) + export(net, input_data, file_name=file_name, file_format='ONNX') # 'AIR', 'ONNX' and 'MINDIR' + print(f'save to {file_name}') + +if __name__ == '__main__': + best_model_export() diff --git a/contrib/Overlap-Recovery/train/models/.gitkeep b/contrib/Overlap-Recovery/train/models/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/contrib/Overlap-Recovery/train/resource_utils/resnet50_dict.json b/contrib/Overlap-Recovery/train/resource_utils/resnet50_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..2561cf7ef3cabaca01cf97369992c2944882a4e6 --- /dev/null +++ b/contrib/Overlap-Recovery/train/resource_utils/resnet50_dict.json @@ -0,0 +1 @@ +{"conv1.weight": "conv1.weight", "bn1.running_mean": "bn1.moving_mean", "bn1.running_var": "bn1.moving_variance", "bn1.weight": "bn1.gamma", "bn1.bias": "bn1.beta", "layer1.0.conv1.weight": "layer1.0.conv1.weight", "layer1.0.bn1.running_mean": "layer1.0.bn1.moving_mean", "layer1.0.bn1.running_var": "layer1.0.bn1.moving_variance", "layer1.0.bn1.weight": "layer1.0.bn1.gamma", "layer1.0.bn1.bias": "layer1.0.bn1.beta", "layer1.0.conv2.weight": "layer1.0.conv2.weight", "layer1.0.bn2.running_mean": "layer1.0.bn2.moving_mean", "layer1.0.bn2.running_var": "layer1.0.bn2.moving_variance", "layer1.0.bn2.weight": "layer1.0.bn2.gamma", "layer1.0.bn2.bias": "layer1.0.bn2.beta", "layer1.0.conv3.weight": "layer1.0.conv3.weight", "layer1.0.bn3.running_mean": "layer1.0.bn3.moving_mean", "layer1.0.bn3.running_var": "layer1.0.bn3.moving_variance", "layer1.0.bn3.weight": "layer1.0.bn3.gamma", "layer1.0.bn3.bias": "layer1.0.bn3.beta", "layer1.0.downsample.0.weight": "layer1.0.downsample.0.weight", "layer1.0.downsample.1.running_mean": "layer1.0.downsample.1.moving_mean", "layer1.0.downsample.1.running_var": "layer1.0.downsample.1.moving_variance", "layer1.0.downsample.1.weight": "layer1.0.downsample.1.gamma", "layer1.0.downsample.1.bias": "layer1.0.downsample.1.beta", "layer1.1.conv1.weight": "layer1.1.conv1.weight", "layer1.1.bn1.running_mean": "layer1.1.bn1.moving_mean", "layer1.1.bn1.running_var": "layer1.1.bn1.moving_variance", "layer1.1.bn1.weight": "layer1.1.bn1.gamma", "layer1.1.bn1.bias": "layer1.1.bn1.beta", "layer1.1.conv2.weight": "layer1.1.conv2.weight", "layer1.1.bn2.running_mean": "layer1.1.bn2.moving_mean", "layer1.1.bn2.running_var": "layer1.1.bn2.moving_variance", "layer1.1.bn2.weight": "layer1.1.bn2.gamma", "layer1.1.bn2.bias": "layer1.1.bn2.beta", "layer1.1.conv3.weight": "layer1.1.conv3.weight", "layer1.1.bn3.running_mean": "layer1.1.bn3.moving_mean", "layer1.1.bn3.running_var": 
"layer1.1.bn3.moving_variance", "layer1.1.bn3.weight": "layer1.1.bn3.gamma", "layer1.1.bn3.bias": "layer1.1.bn3.beta", "layer1.2.conv1.weight": "layer1.2.conv1.weight", "layer1.2.bn1.running_mean": "layer1.2.bn1.moving_mean", "layer1.2.bn1.running_var": "layer1.2.bn1.moving_variance", "layer1.2.bn1.weight": "layer1.2.bn1.gamma", "layer1.2.bn1.bias": "layer1.2.bn1.beta", "layer1.2.conv2.weight": "layer1.2.conv2.weight", "layer1.2.bn2.running_mean": "layer1.2.bn2.moving_mean", "layer1.2.bn2.running_var": "layer1.2.bn2.moving_variance", "layer1.2.bn2.weight": "layer1.2.bn2.gamma", "layer1.2.bn2.bias": "layer1.2.bn2.beta", "layer1.2.conv3.weight": "layer1.2.conv3.weight", "layer1.2.bn3.running_mean": "layer1.2.bn3.moving_mean", "layer1.2.bn3.running_var": "layer1.2.bn3.moving_variance", "layer1.2.bn3.weight": "layer1.2.bn3.gamma", "layer1.2.bn3.bias": "layer1.2.bn3.beta", "layer2.0.conv1.weight": "layer2.0.conv1.weight", "layer2.0.bn1.running_mean": "layer2.0.bn1.moving_mean", "layer2.0.bn1.running_var": "layer2.0.bn1.moving_variance", "layer2.0.bn1.weight": "layer2.0.bn1.gamma", "layer2.0.bn1.bias": "layer2.0.bn1.beta", "layer2.0.conv2.weight": "layer2.0.conv2.weight", "layer2.0.bn2.running_mean": "layer2.0.bn2.moving_mean", "layer2.0.bn2.running_var": "layer2.0.bn2.moving_variance", "layer2.0.bn2.weight": "layer2.0.bn2.gamma", "layer2.0.bn2.bias": "layer2.0.bn2.beta", "layer2.0.conv3.weight": "layer2.0.conv3.weight", "layer2.0.bn3.running_mean": "layer2.0.bn3.moving_mean", "layer2.0.bn3.running_var": "layer2.0.bn3.moving_variance", "layer2.0.bn3.weight": "layer2.0.bn3.gamma", "layer2.0.bn3.bias": "layer2.0.bn3.beta", "layer2.0.downsample.0.weight": "layer2.0.downsample.0.weight", "layer2.0.downsample.1.running_mean": "layer2.0.downsample.1.moving_mean", "layer2.0.downsample.1.running_var": "layer2.0.downsample.1.moving_variance", "layer2.0.downsample.1.weight": "layer2.0.downsample.1.gamma", "layer2.0.downsample.1.bias": "layer2.0.downsample.1.beta", "layer2.1.conv1.weight": "layer2.1.conv1.weight", "layer2.1.bn1.running_mean": "layer2.1.bn1.moving_mean", "layer2.1.bn1.running_var": "layer2.1.bn1.moving_variance", "layer2.1.bn1.weight": "layer2.1.bn1.gamma", "layer2.1.bn1.bias": "layer2.1.bn1.beta", "layer2.1.conv2.weight": "layer2.1.conv2.weight", "layer2.1.bn2.running_mean": "layer2.1.bn2.moving_mean", "layer2.1.bn2.running_var": "layer2.1.bn2.moving_variance", "layer2.1.bn2.weight": "layer2.1.bn2.gamma", "layer2.1.bn2.bias": "layer2.1.bn2.beta", "layer2.1.conv3.weight": "layer2.1.conv3.weight", "layer2.1.bn3.running_mean": "layer2.1.bn3.moving_mean", "layer2.1.bn3.running_var": "layer2.1.bn3.moving_variance", "layer2.1.bn3.weight": "layer2.1.bn3.gamma", "layer2.1.bn3.bias": "layer2.1.bn3.beta", "layer2.2.conv1.weight": "layer2.2.conv1.weight", "layer2.2.bn1.running_mean": "layer2.2.bn1.moving_mean", "layer2.2.bn1.running_var": "layer2.2.bn1.moving_variance", "layer2.2.bn1.weight": "layer2.2.bn1.gamma", "layer2.2.bn1.bias": "layer2.2.bn1.beta", "layer2.2.conv2.weight": "layer2.2.conv2.weight", "layer2.2.bn2.running_mean": "layer2.2.bn2.moving_mean", "layer2.2.bn2.running_var": "layer2.2.bn2.moving_variance", "layer2.2.bn2.weight": "layer2.2.bn2.gamma", "layer2.2.bn2.bias": "layer2.2.bn2.beta", "layer2.2.conv3.weight": "layer2.2.conv3.weight", "layer2.2.bn3.running_mean": "layer2.2.bn3.moving_mean", "layer2.2.bn3.running_var": "layer2.2.bn3.moving_variance", "layer2.2.bn3.weight": "layer2.2.bn3.gamma", "layer2.2.bn3.bias": "layer2.2.bn3.beta", "layer2.3.conv1.weight": 
"layer2.3.conv1.weight", "layer2.3.bn1.running_mean": "layer2.3.bn1.moving_mean", "layer2.3.bn1.running_var": "layer2.3.bn1.moving_variance", "layer2.3.bn1.weight": "layer2.3.bn1.gamma", "layer2.3.bn1.bias": "layer2.3.bn1.beta", "layer2.3.conv2.weight": "layer2.3.conv2.weight", "layer2.3.bn2.running_mean": "layer2.3.bn2.moving_mean", "layer2.3.bn2.running_var": "layer2.3.bn2.moving_variance", "layer2.3.bn2.weight": "layer2.3.bn2.gamma", "layer2.3.bn2.bias": "layer2.3.bn2.beta", "layer2.3.conv3.weight": "layer2.3.conv3.weight", "layer2.3.bn3.running_mean": "layer2.3.bn3.moving_mean", "layer2.3.bn3.running_var": "layer2.3.bn3.moving_variance", "layer2.3.bn3.weight": "layer2.3.bn3.gamma", "layer2.3.bn3.bias": "layer2.3.bn3.beta", "layer3.0.conv1.weight": "layer3.0.conv1.weight", "layer3.0.bn1.running_mean": "layer3.0.bn1.moving_mean", "layer3.0.bn1.running_var": "layer3.0.bn1.moving_variance", "layer3.0.bn1.weight": "layer3.0.bn1.gamma", "layer3.0.bn1.bias": "layer3.0.bn1.beta", "layer3.0.conv2.weight": "layer3.0.conv2.weight", "layer3.0.bn2.running_mean": "layer3.0.bn2.moving_mean", "layer3.0.bn2.running_var": "layer3.0.bn2.moving_variance", "layer3.0.bn2.weight": "layer3.0.bn2.gamma", "layer3.0.bn2.bias": "layer3.0.bn2.beta", "layer3.0.conv3.weight": "layer3.0.conv3.weight", "layer3.0.bn3.running_mean": "layer3.0.bn3.moving_mean", "layer3.0.bn3.running_var": "layer3.0.bn3.moving_variance", "layer3.0.bn3.weight": "layer3.0.bn3.gamma", "layer3.0.bn3.bias": "layer3.0.bn3.beta", "layer3.0.downsample.0.weight": "layer3.0.downsample.0.weight", "layer3.0.downsample.1.running_mean": "layer3.0.downsample.1.moving_mean", "layer3.0.downsample.1.running_var": "layer3.0.downsample.1.moving_variance", "layer3.0.downsample.1.weight": "layer3.0.downsample.1.gamma", "layer3.0.downsample.1.bias": "layer3.0.downsample.1.beta", "layer3.1.conv1.weight": "layer3.1.conv1.weight", "layer3.1.bn1.running_mean": "layer3.1.bn1.moving_mean", "layer3.1.bn1.running_var": "layer3.1.bn1.moving_variance", "layer3.1.bn1.weight": "layer3.1.bn1.gamma", "layer3.1.bn1.bias": "layer3.1.bn1.beta", "layer3.1.conv2.weight": "layer3.1.conv2.weight", "layer3.1.bn2.running_mean": "layer3.1.bn2.moving_mean", "layer3.1.bn2.running_var": "layer3.1.bn2.moving_variance", "layer3.1.bn2.weight": "layer3.1.bn2.gamma", "layer3.1.bn2.bias": "layer3.1.bn2.beta", "layer3.1.conv3.weight": "layer3.1.conv3.weight", "layer3.1.bn3.running_mean": "layer3.1.bn3.moving_mean", "layer3.1.bn3.running_var": "layer3.1.bn3.moving_variance", "layer3.1.bn3.weight": "layer3.1.bn3.gamma", "layer3.1.bn3.bias": "layer3.1.bn3.beta", "layer3.2.conv1.weight": "layer3.2.conv1.weight", "layer3.2.bn1.running_mean": "layer3.2.bn1.moving_mean", "layer3.2.bn1.running_var": "layer3.2.bn1.moving_variance", "layer3.2.bn1.weight": "layer3.2.bn1.gamma", "layer3.2.bn1.bias": "layer3.2.bn1.beta", "layer3.2.conv2.weight": "layer3.2.conv2.weight", "layer3.2.bn2.running_mean": "layer3.2.bn2.moving_mean", "layer3.2.bn2.running_var": "layer3.2.bn2.moving_variance", "layer3.2.bn2.weight": "layer3.2.bn2.gamma", "layer3.2.bn2.bias": "layer3.2.bn2.beta", "layer3.2.conv3.weight": "layer3.2.conv3.weight", "layer3.2.bn3.running_mean": "layer3.2.bn3.moving_mean", "layer3.2.bn3.running_var": "layer3.2.bn3.moving_variance", "layer3.2.bn3.weight": "layer3.2.bn3.gamma", "layer3.2.bn3.bias": "layer3.2.bn3.beta", "layer3.3.conv1.weight": "layer3.3.conv1.weight", "layer3.3.bn1.running_mean": "layer3.3.bn1.moving_mean", "layer3.3.bn1.running_var": "layer3.3.bn1.moving_variance", "layer3.3.bn1.weight": 
"layer3.3.bn1.gamma", "layer3.3.bn1.bias": "layer3.3.bn1.beta", "layer3.3.conv2.weight": "layer3.3.conv2.weight", "layer3.3.bn2.running_mean": "layer3.3.bn2.moving_mean", "layer3.3.bn2.running_var": "layer3.3.bn2.moving_variance", "layer3.3.bn2.weight": "layer3.3.bn2.gamma", "layer3.3.bn2.bias": "layer3.3.bn2.beta", "layer3.3.conv3.weight": "layer3.3.conv3.weight", "layer3.3.bn3.running_mean": "layer3.3.bn3.moving_mean", "layer3.3.bn3.running_var": "layer3.3.bn3.moving_variance", "layer3.3.bn3.weight": "layer3.3.bn3.gamma", "layer3.3.bn3.bias": "layer3.3.bn3.beta", "layer3.4.conv1.weight": "layer3.4.conv1.weight", "layer3.4.bn1.running_mean": "layer3.4.bn1.moving_mean", "layer3.4.bn1.running_var": "layer3.4.bn1.moving_variance", "layer3.4.bn1.weight": "layer3.4.bn1.gamma", "layer3.4.bn1.bias": "layer3.4.bn1.beta", "layer3.4.conv2.weight": "layer3.4.conv2.weight", "layer3.4.bn2.running_mean": "layer3.4.bn2.moving_mean", "layer3.4.bn2.running_var": "layer3.4.bn2.moving_variance", "layer3.4.bn2.weight": "layer3.4.bn2.gamma", "layer3.4.bn2.bias": "layer3.4.bn2.beta", "layer3.4.conv3.weight": "layer3.4.conv3.weight", "layer3.4.bn3.running_mean": "layer3.4.bn3.moving_mean", "layer3.4.bn3.running_var": "layer3.4.bn3.moving_variance", "layer3.4.bn3.weight": "layer3.4.bn3.gamma", "layer3.4.bn3.bias": "layer3.4.bn3.beta", "layer3.5.conv1.weight": "layer3.5.conv1.weight", "layer3.5.bn1.running_mean": "layer3.5.bn1.moving_mean", "layer3.5.bn1.running_var": "layer3.5.bn1.moving_variance", "layer3.5.bn1.weight": "layer3.5.bn1.gamma", "layer3.5.bn1.bias": "layer3.5.bn1.beta", "layer3.5.conv2.weight": "layer3.5.conv2.weight", "layer3.5.bn2.running_mean": "layer3.5.bn2.moving_mean", "layer3.5.bn2.running_var": "layer3.5.bn2.moving_variance", "layer3.5.bn2.weight": "layer3.5.bn2.gamma", "layer3.5.bn2.bias": "layer3.5.bn2.beta", "layer3.5.conv3.weight": "layer3.5.conv3.weight", "layer3.5.bn3.running_mean": "layer3.5.bn3.moving_mean", "layer3.5.bn3.running_var": "layer3.5.bn3.moving_variance", "layer3.5.bn3.weight": "layer3.5.bn3.gamma", "layer3.5.bn3.bias": "layer3.5.bn3.beta", "layer4.0.conv1.weight": "layer4.0.conv1.weight", "layer4.0.bn1.running_mean": "layer4.0.bn1.moving_mean", "layer4.0.bn1.running_var": "layer4.0.bn1.moving_variance", "layer4.0.bn1.weight": "layer4.0.bn1.gamma", "layer4.0.bn1.bias": "layer4.0.bn1.beta", "layer4.0.conv2.weight": "layer4.0.conv2.weight", "layer4.0.bn2.running_mean": "layer4.0.bn2.moving_mean", "layer4.0.bn2.running_var": "layer4.0.bn2.moving_variance", "layer4.0.bn2.weight": "layer4.0.bn2.gamma", "layer4.0.bn2.bias": "layer4.0.bn2.beta", "layer4.0.conv3.weight": "layer4.0.conv3.weight", "layer4.0.bn3.running_mean": "layer4.0.bn3.moving_mean", "layer4.0.bn3.running_var": "layer4.0.bn3.moving_variance", "layer4.0.bn3.weight": "layer4.0.bn3.gamma", "layer4.0.bn3.bias": "layer4.0.bn3.beta", "layer4.0.downsample.0.weight": "layer4.0.downsample.0.weight", "layer4.0.downsample.1.running_mean": "layer4.0.downsample.1.moving_mean", "layer4.0.downsample.1.running_var": "layer4.0.downsample.1.moving_variance", "layer4.0.downsample.1.weight": "layer4.0.downsample.1.gamma", "layer4.0.downsample.1.bias": "layer4.0.downsample.1.beta", "layer4.1.conv1.weight": "layer4.1.conv1.weight", "layer4.1.bn1.running_mean": "layer4.1.bn1.moving_mean", "layer4.1.bn1.running_var": "layer4.1.bn1.moving_variance", "layer4.1.bn1.weight": "layer4.1.bn1.gamma", "layer4.1.bn1.bias": "layer4.1.bn1.beta", "layer4.1.conv2.weight": "layer4.1.conv2.weight", "layer4.1.bn2.running_mean": 
"layer4.1.bn2.moving_mean", "layer4.1.bn2.running_var": "layer4.1.bn2.moving_variance", "layer4.1.bn2.weight": "layer4.1.bn2.gamma", "layer4.1.bn2.bias": "layer4.1.bn2.beta", "layer4.1.conv3.weight": "layer4.1.conv3.weight", "layer4.1.bn3.running_mean": "layer4.1.bn3.moving_mean", "layer4.1.bn3.running_var": "layer4.1.bn3.moving_variance", "layer4.1.bn3.weight": "layer4.1.bn3.gamma", "layer4.1.bn3.bias": "layer4.1.bn3.beta", "layer4.2.conv1.weight": "layer4.2.conv1.weight", "layer4.2.bn1.running_mean": "layer4.2.bn1.moving_mean", "layer4.2.bn1.running_var": "layer4.2.bn1.moving_variance", "layer4.2.bn1.weight": "layer4.2.bn1.gamma", "layer4.2.bn1.bias": "layer4.2.bn1.beta", "layer4.2.conv2.weight": "layer4.2.conv2.weight", "layer4.2.bn2.running_mean": "layer4.2.bn2.moving_mean", "layer4.2.bn2.running_var": "layer4.2.bn2.moving_variance", "layer4.2.bn2.weight": "layer4.2.bn2.gamma", "layer4.2.bn2.bias": "layer4.2.bn2.beta", "layer4.2.conv3.weight": "layer4.2.conv3.weight", "layer4.2.bn3.running_mean": "layer4.2.bn3.moving_mean", "layer4.2.bn3.running_var": "layer4.2.bn3.moving_variance", "layer4.2.bn3.weight": "layer4.2.bn3.gamma", "layer4.2.bn3.bias": "layer4.2.bn3.beta", "fc.weight": "fc.weight", "fc.bias": "fc.bias"} \ No newline at end of file diff --git a/contrib/Overlap-Recovery/train/scripts/convert_resnet.sh b/contrib/Overlap-Recovery/train/scripts/convert_resnet.sh new file mode 100644 index 0000000000000000000000000000000000000000..ffc29c3945157070c95cf1e8c7ae6f9df9d7340b --- /dev/null +++ b/contrib/Overlap-Recovery/train/scripts/convert_resnet.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash convert_resnet.sh PTH_PATH CKPT_PATH" +echo "for example: bash convert_resnet.sh resnet50-19c8e357.pth pretrained_resnet50.ckpt" +echo "==============================================================================================================" + +PTH_PATH=$1 +CKPT_PATH=$2 +PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +DICT_FILE=$PROJECT_DIR/../resource_utils/resnet50_dict.json + +if [ $# != 2 ] +then + echo "Please specify the pth of PyTorch and ckpt of Mindspore" + echo "Please try again" + exit +fi + +LOG_DIR=$PROJECT_DIR/../logs + +python $PROJECT_DIR/../src/utils/pth2ckpt.py \ + --pth-path $PTH_PATH \ + --ckpt-path $CKPT_PATH \ + --dict-file $DICT_FILE > $LOG_DIR/convert_resnet.log 2>&1 & + +echo "The convert_resnet.log file is at /logs/convert_resnet.log" + diff --git a/contrib/Overlap-Recovery/train/scripts/train.sh b/contrib/Overlap-Recovery/train/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..3ec8c4f3861014b0b5cc94f956c76cfa1f90a956 --- /dev/null +++ b/contrib/Overlap-Recovery/train/scripts/train.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python train.py diff --git a/contrib/Overlap-Recovery/train/src/__init__.py b/contrib/Overlap-Recovery/train/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4f96c15807d6ba6a00c589cc2181e8677d1c44dd --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/contrib/Overlap-Recovery/train/src/dataset/__init__.py b/contrib/Overlap-Recovery/train/src/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ca7286c87d2fe83eff122a41ab446bbd6d590f3a --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/dataset/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from .build_dataset import build_dataset diff --git a/contrib/Overlap-Recovery/train/src/dataset/base_dataset.py b/contrib/Overlap-Recovery/train/src/dataset/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d8993d23ae042322b32d3fc3f64f022b4138ca88 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/dataset/base_dataset.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os.path as osp +import warnings + +import numpy as np +from terminaltables import AsciiTable +from .data_process import PipelineFunc + + +class CustomDataset: + """Custom dataset for detection. + + The annotation format is shown as follows. The `ann` field is optional for + testing. + + .. 
code-block:: none + + [ + { + 'filename': 'a.jpg', + 'width': 1280, + 'height': 720, + 'ann': { + 'bboxes': (n, 4) in (x1, y1, x2, y2) order. + 'labels': (n, ), + 'bboxes_ignore': (k, 4), (optional field) + 'labels_ignore': (k, 4) (optional field) + } + }, + ... + ] + + Args: + ann_file (str): Annotation file path. + pipeline (list[dict]): Processing pipeline. + test_mode (bool, optional): If set True, annotation will not be loaded. + """ + + custom_classes = None + + def __init__(self, ann_file, pipeline, img_prefix='', test_mode=False): + self.ann_file = ann_file + self.data_root = None + self.img_prefix = img_prefix + self.seg_prefix = img_prefix + self.seg_suffix = '.png' + self.test_mode = test_mode + self.filter_empty_gt = True + self.custom_classes = self.get_classes(None) + + # join paths if data_root is specified + if self.data_root is not None: + if not osp.isabs(self.ann_file): + self.ann_file = osp.join(self.data_root, self.ann_file) + if not (self.img_prefix is None or osp.isabs(self.img_prefix)): + self.img_prefix = osp.join(self.data_root, self.img_prefix) + if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): + self.seg_prefix = osp.join(self.data_root, self.seg_prefix) + # load annotations (and proposals) + self.data_infos = self.load_annotations(self.ann_file) + + self.proposals = None + + # filter images too small and containing no annotations + if not test_mode: + valid_inds = self._filter_imgs() + self.data_infos = [self.data_infos[i] for i in valid_inds] + if self.proposals is not None: + self.proposals = [self.proposals[i] for i in valid_inds] + # set group flag for the sampler + self._set_group_flag() + + # processing pipeline + self.pipeline = self.build_pipeline(pipeline) + + def __len__(self): + """Total number of samples of data.""" + return len(self.data_infos) + + def __getitem__(self, idx): + """Get training/test data after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training/test data (with annotation if `test_mode` is set \ + True). + """ + + if self.test_mode: + return self.prepare_test_img(idx) + while True: + data = self.prepare_train_img(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def __repr__(self): + """Print the number of instance number.""" + dataset_type = 'Test' if self.test_mode else 'Train' + result = (f'\n{self.__class__.__name__} {dataset_type} dataset ' + f'with number of images {len(self)}, ' + f'and instance counts: \n') + if self.custom_classes is None: + result += 'Category names are not provided. 
\n' + return result + instance_count = np.zeros(len(self.custom_classes) + 1).astype(int) + # count the instance number in each image + for idx in range(len(self)): + label = self.get_ann_info(idx)['labels'] + unique, counts = np.unique(label, return_counts=True) + if len(unique) > 0: + # add the occurrence number to each class + instance_count[unique] += counts + else: + # background is the last index + instance_count[-1] += 1 + # create a table with category count + table_data = [['category', 'count'] * 5] + row_data = [] + for cls, count in enumerate(instance_count): + if cls < len(self.custom_classes): + row_data += [f'{cls} [{self.custom_classes[cls]}]', f'{count}'] + else: + # add the background number + row_data += ['-1 background', f'{count}'] + if len(row_data) == 10: + table_data.append(row_data) + row_data = [] + if len(row_data) >= 2: + if row_data[-1] == '0': + row_data = row_data[:-2] + if len(row_data) >= 2: + table_data.append([]) + table_data.append(row_data) + + table = AsciiTable(table_data) + result += table.table + return result + + @classmethod + def get_classes(cls, classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default custom_classes defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the custom_classes defined by the dataset. + + Returns: + tuple[str] or list[str]: Names of categories of the dataset. + """ + if classes is None: + return cls.custom_classes + raise NotImplementedError + + @staticmethod + def build_pipeline(pipeline): + return PipelineFunc(pipeline) + + def load_annotations(self, ann_file): + """Load annotation from annotation file.""" + print(self.ann_file, ann_file) + raise NotImplementedError + + def get_ann_info(self, idx): + """Get annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. + """ + + return self.data_infos[idx]['ann'] + + def get_cat_ids(self, idx): + """Get category ids by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + return self.data_infos[idx]['ann']['labels'].astype(np.int).tolist() + + def pre_pipeline(self, results): + """Prepare results dict for pipeline.""" + results['img_prefix'] = self.img_prefix + results['seg_prefix'] = self.seg_prefix + results['proposal_file'] = None + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + + def prepare_train_img(self, idx): + """Get training data and annotations after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training data and annotation after pipeline with new keys \ + introduced by pipeline. + """ + + img_info = self.data_infos[idx] + ann_info = self.get_ann_info(idx) + results = dict(img_info=img_info, ann_info=ann_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_test_img(self, idx): + """Get testing data after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Testing data after pipeline with new keys introduced by \ + pipeline. 
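+
+        Note:
+            Unlike ``prepare_train_img``, no ``ann_info`` is attached to the
+            results here, so the test pipeline receives image information
+            (plus proposals, if any) only.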
+ """ + + img_info = self.data_infos[idx] + results = dict(img_info=img_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def get_cat2imgs(self): + """Get a dict with class as key and img_ids as values, which will be + used in :class:`ClassAwareSampler`. + + Returns: + dict[list]: A dict of per-label image list, + the item of the dict indicates a label index, + corresponds to the image index that contains the label. + """ + if self.custom_classes is None: + raise ValueError('self.custom_classes can not be None') + # sort the label index + cat2imgs = {i: [] for i in range(len(self.custom_classes))} + for i in range(len(self)): + cat_ids = set(self.get_cat_ids(i)) + for cat in cat_ids: + cat2imgs[cat].append(i) + return cat2imgs + + def format_results(self, results, **kwargs): + """Place holder to format result to dataset specific output.""" + print(self, results) + raise NotImplementedError + + def _filter_imgs(self, min_size=32): + """Filter images too small.""" + if self.filter_empty_gt: + warnings.warn( + 'CustomDataset does not support filtering empty gt images.') + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if min(img_info['width'], img_info['height']) >= min_size: + valid_inds.append(i) + return valid_inds + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + for i in range(len(self)): + img_info = self.data_infos[i] + if img_info['width'] / img_info['height'] > 1: + self.flag[i] = 1 + + def _rand_another(self, idx): + """Get another random index from the same group as the given index.""" + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + diff --git a/contrib/Overlap-Recovery/train/src/dataset/build_dataset.py b/contrib/Overlap-Recovery/train/src/dataset/build_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..33d2c70d93e5d4a29ad1d37d55a1a38afc2c8068 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/dataset/build_dataset.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from .real_dataset import RealOverlapDataset +from .synth_dataset import SynthOverlapDataset + + +CUSTOM_DATASETS = { + 'RealOverlapDataset': RealOverlapDataset, + 'SynthOverlapDataset': SynthOverlapDataset +} + + +def build_dataset(cfg): + data_type = cfg.pop('type') + if data_type not in CUSTOM_DATASETS: + raise KeyError(f"Not support dataset type: {data_type}") + try: + return CUSTOM_DATASETS[data_type](**cfg) + except KeyError: + raise RuntimeError(KeyError) diff --git a/contrib/Overlap-Recovery/train/src/dataset/data_process.py b/contrib/Overlap-Recovery/train/src/dataset/data_process.py new file mode 100644 index 0000000000000000000000000000000000000000..3f20d6e04e2029dad786b8c366ffd46da6062525 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/dataset/data_process.py @@ -0,0 +1,717 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from os import path as osp +import warnings +from collections.abc import Sequence +import cv2 +import numpy as np +import mmcv +import mindspore as ms +from .utils import BitmapMasks + + +class DataContainer(object): + """A container for any type of objects. + + Typically tensors will be stacked in the collate function and sliced along + some dimension in the scatter function. This behavior has some limitations. + 1. All tensors have to be the same size. + 2. Types are limited (numpy array or Tensor). + + We design `DataContainer` and `MMDataParallel` to overcome these + limitations. The behavior can be either of the following. 
+ + - copy to GPU, pad all tensors to the same size and stack them + - copy to GPU without stacking + - leave the objects as is and pass it to the model + - pad_dims specifies the number of last few dimensions to do padding + """ + + def __init__(self, + data, + stack=False, + padding_value=0, + pad_dims=2): + self._data = data + self._stack = stack + self._padding_value = padding_value + assert pad_dims in [None, 1, 2, 3] + self._pad_dims = pad_dims + + def __repr__(self): + return '{}({})'.format(self.__class__.__name__, repr(self.data)) + + @property + def data(self): + return self._data + + @property + def datatype(self): + if isinstance(self.data, ms.Tensor): + return self.data.type() + else: + return type(self.data) + + @property + def stack(self): + return self._stack + + @property + def padding_value(self): + return self._padding_value + + @property + def pad_dims(self): + return self._pad_dims + + def size(self, *args, **kwargs): + return self.data.size(*args, **kwargs) + + def dim(self): + return self.data.dim() + + +class LoadImageFromFile: + """Load an image from file.""" + + def __init__(self, + to_float32=False, + color_type='color', + channel_order='bgr'): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + + def __call__(self, results): + if results['img_prefix'] is not None: + filename = osp.join(results['img_prefix'], + results['img_info']['filename']) + else: + filename = results['img_info']['filename'] + + img = cv2.imread(filename) + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = filename + results['ori_filename'] = results['img_info']['filename'] + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + results['img_fields'] = ['img'] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f"channel_order='{self.channel_order}' ") + return repr_str + + +class CustomLoadAnnotations: + """Customized load multiple types of annotations.""" + + def __init__(self, + with_bbox=True, + with_label=True, + with_mask=False): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_mask = with_mask + + def __call__(self, results): + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_mask: + results = self._load_masks(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + return repr_str + + @staticmethod + def _load_bboxes(results): + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'].copy() + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + + gt_is_group_ofs = ann_info.get('gt_is_group_ofs', None) + if gt_is_group_ofs is not None: + results['gt_is_group_ofs'] = gt_is_group_ofs.copy() + + return results + + @staticmethod + def _load_labels(results): + results['gt_labels'] = results['ann_info']['labels'].copy() + results['text_labels'] = results['ann_info']['text_labels'].copy() + return results + + @staticmethod + def _load_masks(results): + 
h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = [cv2.imread(_, cv2.IMREAD_UNCHANGED) for _ in results['ann_info']['masks']] + gt_masks = [mask // 255 for mask in gt_masks] + gt_masks = BitmapMasks(gt_masks, h, w) + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + +class Resize: + """Resize images & bbox & mask.""" + + def __init__(self, + img_scale, + keep_ratio=True, + interpolation='bilinear'): + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + multiscale_mode = 'range' + assert multiscale_mode in ['value', 'range'] + + self.multiscale_mode = multiscale_mode + self.keep_ratio = keep_ratio + self.interpolation = interpolation + self.override = False + self.bbox_clip_border = True + + def __call__(self, results): + if 'scale' not in results: + if 'scale_factor' in results: + img_shape = results['img'].shape[:2] + scale_factor = results['scale_factor'] + assert isinstance(scale_factor, float) + results['scale'] = tuple( + [int(x * scale_factor) for x in img_shape][::-1]) + else: + self._random_scale(results) + else: + if not self.override: + assert 'scale_factor' not in results, ( + 'scale and scale_factor cannot be both set.') + else: + results.pop('scale') + if 'scale_factor' in results: + results.pop('scale_factor') + self._random_scale(results) + + self._resize_img(results) + self._resize_bboxes(results) + self._resize_masks(results) + if len(results.get('seg_fields', [])) > 0: + raise NotImplementedError + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'multiscale_mode={self.multiscale_mode}, ' + repr_str += f'keep_ratio={self.keep_ratio}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + def _random_scale(self, results): + if len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + else: + raise NotImplementedError + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + for key in results.get('img_fields', ['img']): + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results[key], + results['scale'], + return_scale=True, + interpolation=self.interpolation) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results[key].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results[key], + results['scale'], + return_scale=True, + interpolation=self.interpolation) + results[key] = img + + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img_shape'] = img.shape + # in case that there is no padding + results['pad_shape'] = img.shape + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_masks(self, results): + """Resize masks with ``results['scale']``""" + for key in results.get('mask_fields', []): + if results[key] is None: + continue + if self.keep_ratio: + results[key] = results[key].rescale(results['scale']) + else: + results[key] = results[key].resize(results['img_shape'][:2]) + + def _resize_bboxes(self, results): + """Resize bounding boxes with ``results['scale_factor']``.""" + for key in results.get('bbox_fields', []): + bboxes = results[key] * results['scale_factor'] + if 
self.bbox_clip_border: + img_shape = results['img_shape'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + results[key] = bboxes + + +class RandomFlip: + """Flip the image & bbox & mask.""" + + def __init__(self, flip_ratio=None, direction='horizontal'): + if isinstance(flip_ratio, list): + assert mmcv.is_list_of(flip_ratio, float) + assert 0 <= sum(flip_ratio) <= 1 + elif isinstance(flip_ratio, float): + assert 0 <= flip_ratio <= 1 + elif flip_ratio is None: + pass + else: + raise ValueError('flip_ratios must be None, float, ' + 'or list of float') + self.flip_ratio = flip_ratio + + valid_directions = ['horizontal', 'vertical', 'diagonal'] + if isinstance(direction, str): + assert direction in valid_directions + elif isinstance(direction, list): + assert mmcv.is_list_of(direction, str) + assert set(direction).issubset(set(valid_directions)) + else: + raise ValueError('direction must be either str or list of str') + self.direction = direction + + if isinstance(flip_ratio, list): + assert len(self.flip_ratio) == len(self.direction) + + def __call__(self, results): + if 'flip' not in results: + if isinstance(self.direction, list): + # None means non-flip + direction_list = self.direction + [None] + else: + # None means non-flip + direction_list = [self.direction, None] + + if isinstance(self.flip_ratio, list): + non_flip_ratio = 1 - sum(self.flip_ratio) + flip_ratio_list = self.flip_ratio + [non_flip_ratio] + else: + non_flip_ratio = 1 - self.flip_ratio + # exclude non-flip + single_ratio = self.flip_ratio / (len(direction_list) - 1) + flip_ratio_list = [single_ratio] * (len(direction_list) - + 1) + [non_flip_ratio] + + cur_dir = np.random.choice(direction_list, p=flip_ratio_list) + + results['flip'] = cur_dir is not None + if 'flip_direction' not in results: + results['flip_direction'] = cur_dir + if results['flip']: + # flip image + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + # flip bboxes + for key in results.get('bbox_fields', []): + results[key] = self.bbox_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip masks + for key in results.get('mask_fields', []): + results[key] = results[key].flip(results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' + + @staticmethod + def bbox_flip(bboxes, img_shape, direction): + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.copy() + if direction == 'horizontal': + w = img_shape[1] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + elif direction == 'vertical': + h = img_shape[0] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + elif direction == 'diagonal': + w = img_shape[1] + h = img_shape[0] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + else: + raise ValueError(f"Invalid flipping direction '{direction}'") + return flipped + + +class Normalize: + """Normalize the image.""" + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + 
self.to_rgb = to_rgb + + def __call__(self, results): + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imnormalize(results[key], self.mean, self.std, + self.to_rgb) + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' + return repr_str + + +class Pad: + """Pad the image & masks & segmentation map.""" + + def __init__(self, eval_model=False, **kwargs): + size = kwargs.get('size', None) + size_divisor = kwargs.get('size_divisor', None) + pad_to_square = kwargs.get('pad_to_square', False) + pad_val = kwargs.get('pad_val', None) + pad_ins_num = kwargs.get('pad_ins_num', 4) + self.size = size + self.size_divisor = size_divisor + if isinstance(pad_val, type(None)): + pad_val = dict(img=0, masks=0, seg=255) + if isinstance(pad_val, float) or isinstance(pad_val, int): + warnings.warn( + 'pad_val of float type is deprecated now, ' + f'please use pad_val=dict(img={pad_val}, ' + f'masks={pad_val}, seg=255) instead.', DeprecationWarning) + pad_val = dict(img=pad_val, masks=pad_val, seg=255) + assert isinstance(pad_val, dict) + self.pad_val = pad_val + self.pad_to_square = pad_to_square + self.pad_ins_num = pad_ins_num + self.eval_model = eval_model + + if pad_to_square: + assert size is None and size_divisor is None, \ + 'The size and size_divisor must be None ' \ + 'when pad2square is True' + else: + assert size is not None or size_divisor is not None, \ + 'only one of size and size_divisor should be valid' + assert size is None or size_divisor is None + + def __call__(self, results): + self._pad_img(results) + if self.eval_model: + return results + self._pad_masks(results) + self._pad_seg(results) + # padding instance number to predefined + to_pad = self.pad_ins_num - results['gt_bboxes'].shape[0] + if to_pad > 0: + results['gt_bboxes'] = np.concatenate([results['gt_bboxes'], + np.zeros((to_pad, 4), dtype=np.float32)], + axis=0) + results['gt_labels'] = np.concatenate([results['gt_labels'], + -np.ones((to_pad,), dtype=np.long)]) + gt_masks = results['gt_masks'].masks + h, w = gt_masks.shape[1:] + gt_masks = np.concatenate([gt_masks, + np.zeros((to_pad, h, w), dtype=gt_masks.dtype)], + axis=0) + results['gt_masks'] = BitmapMasks(gt_masks, h, w) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_to_square={self.pad_to_square}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + pad_val = self.pad_val.get('img', 0) + for key in results.get('img_fields', ['img']): + if self.pad_to_square: + max_size = max(results[key].shape[:2]) + self.size = (max_size, max_size) + if self.size is not None: + padded_img = mmcv.impad( + results[key], shape=self.size, pad_val=pad_val) + elif self.size_divisor is not None: + padded_img = mmcv.impad_to_multiple( + results[key], self.size_divisor, pad_val=pad_val) + results[key] = padded_img + results['pad_shape'] = padded_img.shape + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def _pad_masks(self, results): + """Pad masks according to ``results['pad_shape']``.""" + pad_shape = results['pad_shape'][:2] + pad_val = self.pad_val.get('masks', 0) + for key in results.get('mask_fields', []): + 
results[key] = results[key].pad(pad_shape, pad_val=pad_val) + + def _pad_seg(self, results): + """Pad semantic segmentation map according to + ``results['pad_shape']``.""" + pad_val = self.pad_val.get('seg', 255) + for key in results.get('seg_fields', []): + results[key] = mmcv.impad( + results[key], shape=results['pad_shape'][:2], pad_val=pad_val) + + +def to_tensor(data): + """Convert objects of various python types to :obj:`mindspore.Tensor`.""" + + if isinstance(data, ms.Tensor): + return data + elif isinstance(data, np.ndarray): + return ms.Tensor.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return ms.Tensor(data) + elif isinstance(data, int): + return ms.Tensor([data], dtype=ms.int64) + elif isinstance(data, float): + return ms.Tensor([data], dtype=ms.float32) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +class DefaultFormatBundle: + """Default formatting bundle.""" + + def __init__(self, + img_to_float=True, + pad_val=None): + if isinstance(pad_val, type(None)): + pad_val = dict(img=0, masks=0, seg=255) + self.img_to_float = img_to_float + self.pad_val = pad_val + + def __call__(self, results): + if 'img' in results: + img = results['img'] + if self.img_to_float is True and img.dtype == np.uint8: + # Normally, image is of uint8 type without normalization. + # At this time, it needs to be forced to be converted to + # flot32, otherwise the model training and inference + # will be wrong. Only used for YOLOX currently . + img = img.astype(np.float32) + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + pad_val = self.pad_val.get('img', 0) + results['img'] = DataContainer( + to_tensor(img), padding_value=pad_val, stack=True) + for key in ['proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels']: + if key not in results: + continue + results[key] = DataContainer(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DataContainer( + results['gt_masks'], + padding_value=self.pad_val.get('masks', None) + ) + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(img_to_float={self.img_to_float})' + + @staticmethod + def _add_default_meta_keys(results): + img = results['img'] + results.setdefault('pad_shape', img.shape) + results.setdefault('scale_factor', 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + 'img_norm_cfg', + dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False)) + return results + + +class Collect: + """Collect data from the loader relevant to the specific task.""" + + def __init__(self, + keys, + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg'), + eval_mode=False): + self.keys = keys + self.meta_keys = meta_keys + self.eval_mode = eval_mode + + def __call__(self, results): + data = {} + img_meta = {} + out_data = [] + for key in self.meta_keys: + img_meta[key] = results[key] + data['img_metas'] = DataContainer(img_meta) + for key in self.keys: + data[key] = results[key] + for key in self.keys: + if self.eval_mode: + out_data.append(results[key]) + continue + if key == 'gt_masks': + out_data.append(results[key].data.masks) + else: + out_data.append(results[key].data.asnumpy()) + flip_map = { + 'horizontal': 0, + 'vertical': 1, + 
'diagonal': 2 + } + for key in self.meta_keys: + if key == 'flip_direction': + if isinstance(results[key], type(None)): + out_data.append(-1) + else: + try: + out_data.append(flip_map[results[key]]) + except KeyError: + raise KeyError + else: + out_data.append(results[key]) + return tuple(out_data) + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + +class ImageToTensor: + """Convert image to :obj:`torch.Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). + + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`torch.Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`torch.Tensor` and transposed to (C, H, W) order. + """ + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + results[key] = to_tensor(img.transpose(2, 0, 1)) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +CUSTOM_PIPELINES = { + 'LoadImageFromFile': LoadImageFromFile, + 'CustomLoadAnnotations': CustomLoadAnnotations, + 'Resize': Resize, + 'RandomFlip': RandomFlip, + 'Normalize': Normalize, + 'Pad': Pad, + 'DefaultFormatBundle': DefaultFormatBundle, + 'Collect': Collect, + 'ImageToTensor': ImageToTensor +} + + +class PipelineFunc: + def __init__(self, pipelines): + self.pipelines = [] + for pipe in pipelines: + pipe_type = pipe.pop('type') + try: + self.pipelines.append(CUSTOM_PIPELINES[pipe_type](**pipe)) + except KeyError: + raise KeyError + + def __call__(self, results): + for pipe in self.pipelines: + results = pipe(results) + return results diff --git a/contrib/Overlap-Recovery/train/src/dataset/real_dataset.py b/contrib/Overlap-Recovery/train/src/dataset/real_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..716d62a6ecf1dc358e37dff28ed466278480ae7a --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/dataset/real_dataset.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
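+
+# Annotation format consumed by ``RealOverlapDataset.load_annotations`` below:
+# a single JSON list in which every entry carries exactly three keys
+# ('img_name', 'data_type', 'texts'). The sketch below is illustrative only;
+# field values are placeholders inferred from the parsing code:
+#
+#   {
+#     "img_name": "0001.jpg",
+#     "data_type": "real",
+#     "texts": [
+#       {"bbox": [x, y, w, h], "mask": "0001_mask_0.png", "label": "hello"},
+#       {"bbox": [x, y, w, h], "mask": "0001_mask_1.png", "label": "world"}
+#     ]
+#   }
+#
+# Each bbox is converted to (x1, y1, x2, y2), every mask path is resolved
+# against ``seg_prefix``, and text instances are expected to be ordered from
+# the top layer to the bottom one.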
+ + +import json +import os +import os.path as osp +from tqdm import tqdm + +import cv2 +import numpy as np +import imagesize + +from .base_dataset import CustomDataset +from .utils import cal_mask_iou, cal_overlap_mask, cal_union_mask + + +class RealOverlapDataset(CustomDataset): + """Custom Synthetic Overlap dataset for text de-occlusion.""" + custom_classes = ('text', ) + + def __init__(self, score_thresh=0.5, iou_thresh=0.5, res_flags=None, **kwargs): + self.score_thresh = score_thresh + self.iou_thresh = iou_thresh + self.res_flags = res_flags + super(RealOverlapDataset, self).__init__(**kwargs) + + def load_annotations(self, ann_file): + """Load annotation from Synth Overlap""" + data_list = [] + img_dir = self.img_prefix + seg_dir = self.seg_prefix + if osp.isfile(ann_file): + with open(ann_file, 'r', encoding='utf-8') as f: + info_list = json.load(f) + for info_ in info_list: + assert len(info_) == 3, f"Invalid line: {info_}" + img_name = info_['img_name'] + data_info = dict(img_path=osp.join(img_dir, img_name)) + data_info['data_type'] = info_['data_type'] + data_info['filename'] = img_name + try: + width, height = imagesize.get(data_info['img_path']) + except KeyError: + raise RuntimeError(KeyError) + data_info['width'] = width + data_info['height'] = height + seg_map_path = [] + text_labels = [] + bboxes = [] + # should follow a pre-defined order, e.g., from top layer to bottom + for text_ins in info_['texts']: + x, y, w, h = text_ins['bbox'] + bbox = [x, y, x + w, y + h] + bboxes.append(bbox) + seg_map_path.append(osp.join(seg_dir, text_ins[f"mask"])) + text_labels.append(text_ins['label']) + data_info['bboxes'] = bboxes + data_info['seg_map_path'] = seg_map_path + data_info['text_labels'] = text_labels + data_list.append(data_info) + else: + raise NotImplementedError + return data_list + + def get_ann_info(self, idx): + data_info = self.data_infos[idx] + ann = dict( + bboxes=np.array(data_info['bboxes'], dtype=np.float32), + labels=np.zeros(len(data_info['bboxes']), dtype=np.int64), + text_labels=data_info['text_labels'], + bboxes_ignore=np.zeros((0, 4), dtype=np.float32), + masks=data_info['seg_map_path'], + seg_map=data_info['seg_map_path'] + ) + return ann + + def vis_result(self, img_idx, scores, masks, vis_dir='/home/whua/vis'): + if not os.path.exists(vis_dir): + os.mkdir(vis_dir) + if len(scores) == 0: + return + valid_idx = [] + for idx, score in enumerate(scores): + if score > self.score_thresh: + valid_idx.append(idx) + if len(valid_idx) > 0: + img = cv2.imread(self.data_infos[img_idx]['img_path']) + img_name = self.data_infos[img_idx]['img_path'].split('/')[-1].split('.')[0] + cv2.imwrite(os.path.join(vis_dir, f"{img_name}.jpg"), img) + for idx, ins_idx in enumerate(valid_idx): + save_name = f"{img_name}_{idx}.jpg" + canvas = np.zeros_like(img) + canvas[masks[ins_idx]] = img[masks[ins_idx]] + cv2.imwrite(os.path.join(vis_dir, save_name), canvas) + + def eval_func(self, idx, box_scores, masks): + # prepare gt ~ hard code + gt_masks = [cv2.imread(x, cv2.IMREAD_UNCHANGED) // 255 for x in self.data_infos[idx]['seg_map_path']] + gt_text = cal_union_mask(gt_masks) + gt_overlap = cal_overlap_mask(gt_masks) + # prepare predict of overlap and text area + box_info = box_scores[0] + if len(box_info) < 2: + raise RuntimeError + else: + # select top 2 prediction + scores = box_info[:, 4].tolist() + valid_idx = [] + for ins_idx, box_ in enumerate(box_info): + if box_[-1] > self.score_thresh: + valid_idx.append(ins_idx) + pred_masks = [masks[0][_] for _ in valid_idx] + if 
len(pred_masks) == 0: + pred_overlap = np.zeros_like(masks[0][0]) + pred_text = np.zeros_like(masks[0][0]) + elif len(pred_masks) == 1: + pred_overlap = np.zeros_like(masks[0][0]) + pred_text = cal_union_mask(pred_masks) + else: + pred_overlap = cal_overlap_mask(pred_masks) + pred_text = cal_union_mask(pred_masks) + if len(gt_masks) > 1: + # calculate metrics + intersection_text = (pred_text & gt_text).sum() + union_text = (pred_text | gt_text).sum() + intersection_overlap = (pred_overlap & gt_overlap).sum() + union_overlap = (pred_overlap | gt_overlap).sum() + else: + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + + # prepare predict of text instance + # filter out invalid prediction + valid_idx = [] + for ins_idx, box_ in enumerate(box_info): + if box_[-1] > self.score_thresh: + valid_idx.append(ins_idx) + match_matrix = np.zeros((len(valid_idx), len(gt_masks)), dtype=np.bool) + num_valid = len(valid_idx) + num_gt_masks = len(gt_masks) + for ins_idx in range(num_valid): + for gt_ins_idx in range(num_gt_masks): + if match_matrix[:, gt_ins_idx].sum() > 0: + continue + # calculate IoU + if cal_mask_iou(masks[0][valid_idx[ins_idx]], gt_masks[gt_ins_idx]) > self.iou_thresh: + match_matrix[ins_idx, gt_ins_idx] = True + break + # calculate instance-wise mIoU + text_ins_miou = 0 + if match_matrix.sum() > 0: + for ins_idx in range(max(match_matrix.shape)): + if ins_idx >= match_matrix.shape[0]: + # miss det + continue + else: + if ins_idx >= match_matrix.shape[1] or match_matrix[ins_idx].sum() == 0: + # wrong det + continue + else: + pred_mask = masks[0][valid_idx[ins_idx]].astype(np.bool) + gt_idx = match_matrix[ins_idx].nonzero()[0][0] + gt_mask = gt_masks[gt_idx].copy() + cur_iou = cal_mask_iou(pred_mask, gt_mask) + text_ins_miou += cur_iou + return (intersection_text, union_text, intersection_overlap, union_overlap), \ + text_ins_miou, max(match_matrix.shape) + + def evaluate(self, results, metric='segm', **kwargs): + metric = metric if isinstance(metric, str) else metric[0] + allowed_metrics = ['segm', 'segm_multi'] + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + assert len(results) == self.__len__() + if metric in ['segm']: + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + text_ins_miou_list = [] + total_ins_num = 0 + for idx, (box_scores, masks) in tqdm(enumerate(results)): + # structure: + # box_scores: List[ numpy_array with shape (num_ins, 5=4*coord+1*score) * num_classes ] + # masks: List[ List[ numpy_array_bool with shape (h, w) * num_ins ] * num_classes ] + + overall_iou_metrics, text_ins_miou, ins_num = self.eval_func(idx, box_scores, masks) + intersection_text += overall_iou_metrics[0] + union_text += overall_iou_metrics[1] + intersection_overlap += overall_iou_metrics[2] + union_overlap += overall_iou_metrics[3] + text_ins_miou_list.append(text_ins_miou) + total_ins_num += ins_num + metric_results = dict( + text_iou=intersection_text / union_text, + overlap_iou=intersection_overlap / union_overlap, + text_ins_miou=np.sum(text_ins_miou_list) / total_ins_num + ) + else: + assert len(self.res_flags) == len(results[0]) + metric_results = dict() + for flag_idx, flag in enumerate(self.res_flags): + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + text_ins_miou_list = [] + total_ins_num = 0 + for idx in tqdm(range(len(results))): + # structure: + # box_scores: List[ numpy_array with shape (num_ins, 5=4*coord+1*score) * 
num_classes ] + # masks: List[ List[ numpy_array_bool with shape (h, w) * num_ins ] * num_classes ] + box_scores, masks = results[idx][flag_idx] + overall_iou_metrics, text_ins_miou, ins_num = self.eval_func(idx, box_scores, masks) + intersection_text += overall_iou_metrics[0] + union_text += overall_iou_metrics[1] + intersection_overlap += overall_iou_metrics[2] + union_overlap += overall_iou_metrics[3] + text_ins_miou_list.append(text_ins_miou) + total_ins_num += ins_num + + metric_results[flag] = dict( + text_iou=intersection_text / (union_text + 1e-6) + ) + + return metric_results + + def _filter_imgs(self, min_size=32): + """Filter images too small or without ground truths.""" + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if self.filter_empty_gt and len(img_info['seg_map_path']) == 0: + if len(img_info['seg_map_path']) == 0 or len(img_info['text_labels']) == 0: + continue + if min(img_info['width'], img_info['height']) >= min_size: + valid_inds.append(i) + return valid_inds diff --git a/contrib/Overlap-Recovery/train/src/dataset/synth_dataset.py b/contrib/Overlap-Recovery/train/src/dataset/synth_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..67a20ba8b0adf43dc05250aa50735e06ea5fd2c4 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/dataset/synth_dataset.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
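+
+# Annotation format consumed by ``SynthOverlapDataset.load_annotations`` below:
+# unlike the real dataset, the file is read line by line and every line is one
+# JSON object with exactly two keys ('img_name' and 'texts'). The sketch below
+# is illustrative only; field values are placeholders inferred from the parser:
+#
+#   {"img_name": "synth_0001.jpg",
+#    "texts": [{"bbox": [x, y, w, h], "mask_bin": "synth_0001_mask_0.png", "label": "text"}]}
+#
+# Each bbox is converted to (x1, y1, x2, y2), every mask_bin path is resolved
+# against ``seg_prefix``, and text instances are expected to be ordered from
+# the top layer to the bottom one.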
+ + +import json +import os +import os.path as osp +from tqdm import tqdm + +import cv2 +import numpy as np +import imagesize + +from .base_dataset import CustomDataset +from .utils import cal_mask_iou, cal_overlap_mask, cal_union_mask + + +class SynthOverlapDataset(CustomDataset): + """Custom Synthetic Overlap dataset for text de-occlusion.""" + custom_classes = ('text', ) + + def __init__(self, score_thresh=0.5, iou_thresh=0.5, res_flags=None, **kwargs): + self.score_thresh = score_thresh + self.iou_thresh = iou_thresh + self.res_flags = res_flags + super(SynthOverlapDataset, self).__init__(**kwargs) + + def load_annotations(self, ann_file): + """Load annotation from Synth Overlap""" + data_list = [] + img_dir = self.img_prefix + seg_dir = self.seg_prefix + if osp.isfile(ann_file): + with open(ann_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + for line in lines: + info_ = json.loads(line.strip()) + assert len(info_) == 2, f"Invalid line: {line}" + img_name = info_['img_name'] + data_info = dict(img_path=osp.join(img_dir, img_name)) + data_info['filename'] = img_name + try: + width, height = imagesize.get(data_info['img_path']) + except KeyError: + raise RuntimeError(KeyError) + data_info['width'] = width + data_info['height'] = height + seg_map_path = [] + text_labels = [] + bboxes = [] + # should follow a pre-defined order, e.g., from top layer to bottom + for text_ins in info_['texts']: + x, y, w, h = text_ins['bbox'] + bbox = [x, y, x + w, y + h] + bboxes.append(bbox) + seg_map_path.append(osp.join(seg_dir, text_ins[f"mask_bin"])) + text_labels.append(text_ins['label']) + data_info['bboxes'] = bboxes + data_info['seg_map_path'] = seg_map_path + data_info['text_labels'] = text_labels + data_list.append(data_info) + else: + raise NotImplementedError + return data_list + + def get_ann_info(self, idx): + data_info = self.data_infos[idx] + ann = dict( + bboxes=np.array(data_info['bboxes'], dtype=np.float32), + labels=np.zeros(len(data_info['bboxes']), dtype=np.int64), + text_labels=data_info['text_labels'], + bboxes_ignore=np.zeros((0, 4), dtype=np.float32), + masks=data_info['seg_map_path'], + seg_map=data_info['seg_map_path'] + ) + return ann + + def vis_result(self, img_idx, scores, masks, vis_dir='/home/whua/vis'): + if not os.path.exists(vis_dir): + os.mkdir(vis_dir) + if len(scores) == 0: + return + valid_idx = [] + for idx, score in enumerate(scores): + if score > self.score_thresh: + valid_idx.append(idx) + if len(valid_idx) > 0: + img = cv2.imread(self.data_infos[img_idx]['img_path']) + img_name = self.data_infos[img_idx]['img_path'].split('/')[-1].split('.')[0] + cv2.imwrite(os.path.join(vis_dir, f"{img_name}.jpg"), img) + for idx, ins_idx in enumerate(valid_idx): + save_name = f"{img_name}_{idx}.jpg" + canvas = np.zeros_like(img) + canvas[masks[ins_idx]] = img[masks[ins_idx]] + cv2.imwrite(os.path.join(vis_dir, save_name), canvas) + + def eval_func(self, idx, box_scores, masks): + # prepare gt ~ hard code + gt_masks = [cv2.imread(x, cv2.IMREAD_UNCHANGED) // 255 for x in self.data_infos[idx]['seg_map_path']] + gt_text = cal_union_mask(gt_masks) + gt_overlap = cal_overlap_mask(gt_masks) + # prepare predict of overlap and text area + box_info = box_scores[0] + if len(box_info) < 2: + raise RuntimeError + else: + # select top 2 prediction + scores = box_info[:, 4].tolist() + valid_idx = [] + for ins_idx, box_ in enumerate(box_info): + if box_[-1] > self.score_thresh: + valid_idx.append(ins_idx) + pred_masks = [masks[0][_] for _ in valid_idx] + if len(pred_masks) == 0: 
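+                # Predicted overlap/text-union masks are built in three cases:
+                # no confident prediction -> both masks stay empty; a single
+                # prediction -> the overlap mask stays empty and the text mask
+                # is that prediction; two or more -> overlap and union are
+                # computed over all kept masks via cal_overlap_mask/cal_union_mask.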
+ pred_overlap = np.zeros_like(masks[0][0]) + pred_text = np.zeros_like(masks[0][0]) + elif len(pred_masks) == 1: + pred_overlap = np.zeros_like(masks[0][0]) + pred_text = cal_union_mask(pred_masks) + else: + pred_overlap = cal_overlap_mask(pred_masks) + pred_text = cal_union_mask(pred_masks) + if len(gt_masks) > 1: + # calculate metrics + intersection_text = (pred_text & gt_text).sum() + union_text = (pred_text | gt_text).sum() + intersection_overlap = (pred_overlap & gt_overlap).sum() + union_overlap = (pred_overlap | gt_overlap).sum() + else: + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + + # prepare predict of text instance + # filter out invalid prediction + valid_idx = [] + for ins_idx, box_ in enumerate(box_info): + if box_[-1] > self.score_thresh: + valid_idx.append(ins_idx) + match_matrix = np.zeros((len(valid_idx), len(gt_masks)), dtype=np.bool) + num_valid = len(valid_idx) + num_gt_masks = len(gt_masks) + for ins_idx in range(num_valid): + for gt_ins_idx in range(num_gt_masks): + if match_matrix[:, gt_ins_idx].sum() > 0: + continue + # calculate IoU + if cal_mask_iou(masks[0][valid_idx[ins_idx]], gt_masks[gt_ins_idx]) > self.iou_thresh: + match_matrix[ins_idx, gt_ins_idx] = True + break + # calculate instance-wise mIoU + text_ins_miou = 0 + if match_matrix.sum() > 0: + for ins_idx in range(max(match_matrix.shape)): + if ins_idx >= match_matrix.shape[0]: + # miss det + continue + else: + if ins_idx >= match_matrix.shape[1] or match_matrix[ins_idx].sum() == 0: + # wrong det + continue + else: + pred_mask = masks[0][valid_idx[ins_idx]].astype(np.bool) + gt_idx = match_matrix[ins_idx].nonzero()[0][0] + gt_mask = gt_masks[gt_idx].copy() + cur_iou = cal_mask_iou(pred_mask, gt_mask) + text_ins_miou += cur_iou + return (intersection_text, union_text, intersection_overlap, union_overlap), \ + text_ins_miou, max(match_matrix.shape) + + def evaluate(self, results, metric='bbox',): + metric = metric if isinstance(metric, str) else metric[0] + allowed_metrics = ['segm', 'segm_multi'] + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + assert len(results) == self.__len__() + if metric in ['segm']: + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + text_ins_miou_list = [] + total_ins_num = 0 + for idx, (box_scores, masks) in tqdm(enumerate(results)): + # structure: + # box_scores: List[ numpy_array with shape (num_ins, 5=4*coord+1*score) * num_classes ] + # masks: List[ List[ numpy_array_bool with shape (h, w) * num_ins ] * num_classes ] + + overall_iou_metrics, text_ins_miou, ins_num = self.eval_func(idx, box_scores, masks) + intersection_text += overall_iou_metrics[0] + union_text += overall_iou_metrics[1] + intersection_overlap += overall_iou_metrics[2] + union_overlap += overall_iou_metrics[3] + text_ins_miou_list.append(text_ins_miou) + total_ins_num += ins_num + metric_results = dict( + text_iou=intersection_text / union_text, + overlap_iou=intersection_overlap / union_overlap, + text_ins_miou=np.sum(text_ins_miou_list) / total_ins_num + ) + else: + assert len(self.res_flags) == len(results[0]) + metric_results = dict() + for flag_idx, flag in enumerate(self.res_flags): + intersection_text = 0 + union_text = 0 + intersection_overlap = 0 + union_overlap = 0 + text_ins_miou_list = [] + total_ins_num = 0 + for idx in tqdm(range(len(results))): + # structure: + # box_scores: List[ numpy_array with shape (num_ins, 5=4*coord+1*score) * num_classes ] + # masks: List[ List[ 
numpy_array_bool with shape (h, w) * num_ins ] * num_classes ] + box_scores, masks = results[idx][flag_idx] + overall_iou_metrics, text_ins_miou, ins_num = self.eval_func(idx, box_scores, masks) + intersection_text += overall_iou_metrics[0] + union_text += overall_iou_metrics[1] + intersection_overlap += overall_iou_metrics[2] + union_overlap += overall_iou_metrics[3] + text_ins_miou_list.append(text_ins_miou) + total_ins_num += ins_num + + metric_results[flag] = dict( + text_iou=intersection_text / (union_text + 1e-6) + ) + + return metric_results + + def _filter_imgs(self, min_size=32): + """Filter images too small or without ground truths.""" + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if self.filter_empty_gt and len(img_info['seg_map_path']) == 0: + if len(img_info['seg_map_path']) == 0 or len(img_info['text_labels']) == 0: + continue + if min(img_info['width'], img_info['height']) >= min_size: + valid_inds.append(i) + return valid_inds diff --git a/contrib/Overlap-Recovery/train/src/dataset/utils.py b/contrib/Overlap-Recovery/train/src/dataset/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..70da521b516bdef5ab108c3a32e6b07655a5c1fc --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/dataset/utils.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np +import mmcv +import mindspore as ms + + +def cal_mask_iou(mask_a, mask_b, check_valid=False): + if check_valid: + assert len(np.unique(mask_a)) <= 2 + assert len(np.unique(mask_b)) <= 2 + a_bool = mask_a.astype(np.bool) + b_bool = mask_b.astype(np.bool) + intersection_area = (a_bool & b_bool).sum() + union_area = (a_bool | b_bool).sum() + if union_area == 0: + return 0 + return intersection_area / union_area + + +# def CalOverlapMask(mask_list): +def cal_overlap_mask(mask_list): + if len(mask_list) < 2: + return None + mask_list_bool = [x.astype(np.bool) for x in mask_list] + overlap_mask = np.zeros_like(mask_list_bool[0]) + for ii in range(len(mask_list_bool) - 1): + for jj in range(ii + 1, len(mask_list_bool)): + cur_olp = mask_list_bool[ii] & mask_list_bool[jj] + overlap_mask = overlap_mask | cur_olp + return overlap_mask + + +# def CalUnionMask(mask_list): +def cal_union_mask(mask_list): + if len(mask_list) < 1: + return None + mask_list_bool = [x.astype(np.bool) for x in mask_list] + union_mask = np.zeros_like(mask_list_bool[0]) + for mask_bool in mask_list_bool: + union_mask = union_mask | mask_bool + return union_mask + + +class BitmapMasks: + """This class represents masks in the form of bitmaps. + + Args: + masks (ndarray): ndarray of masks in shape (N, H, W), where N is + the number of objects. 
+ height (int): height of masks + width (int): width of masks + """ + + def __init__(self, masks, height, width): + self.height = height + self.width = width + if isinstance(masks, ms.Tensor): + len_mask = masks.shape[0] + else: + len_mask = len(masks) + if len_mask == 0: + self.masks = np.empty((0, self.height, self.width), dtype=np.uint8) + else: + if isinstance(masks, ms.Tensor): + self.masks = masks.asnumpy() + else: + assert isinstance(masks, (list, np.ndarray)) + if isinstance(masks, list): + assert isinstance(masks[0], np.ndarray) + assert masks[0].ndim == 2 # (H, W) + else: + assert masks.ndim == 3 # (N, H, W) + + self.masks = np.stack(masks).reshape(-1, height, width) + assert self.masks.shape[1] == self.height + assert self.masks.shape[2] == self.width + + def __getitem__(self, index): + """Index the BitmapMask. + + Args: + index (int | ndarray): Indices in the format of integer or ndarray. + + Returns: + :obj:`BitmapMasks`: Indexed bitmap masks. + """ + masks = self.masks[index].reshape(-1, self.height, self.width) + return BitmapMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + @property + def areas(self): + """See :py:attr:`BaseInstanceMasks.areas`.""" + return self.masks.sum((1, 2)) + + def rescale(self, scale, interpolation='nearest'): + """See :func:`BaseInstanceMasks.rescale`.""" + if len(self.masks) == 0: + new_w, new_h = mmcv.rescale_size((self.width, self.height), scale) + rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8) + else: + rescaled_masks = np.stack([ + mmcv.imrescale(mask, scale, interpolation=interpolation) + for mask in self.masks + ]) + height, width = rescaled_masks.shape[1:] + return BitmapMasks(rescaled_masks, height, width) + + def resize(self, out_shape, interpolation='nearest'): + """See :func:`BaseInstanceMasks.resize`.""" + if len(self.masks) == 0: + resized_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + resized_masks = np.stack([ + mmcv.imresize( + mask, out_shape[::-1], interpolation=interpolation) + for mask in self.masks + ]) + return BitmapMasks(resized_masks, *out_shape) + + def flip(self, flip_direction='horizontal'): + """See :func:`BaseInstanceMasks.flip`.""" + assert flip_direction in ('horizontal', 'vertical', 'diagonal') + + if len(self.masks) == 0: + flipped_masks = self.masks + else: + flipped_masks = np.stack([ + mmcv.imflip(mask, direction=flip_direction) + for mask in self.masks + ]) + return BitmapMasks(flipped_masks, self.height, self.width) + + def pad(self, out_shape, pad_val=0): + """See :func:`BaseInstanceMasks.pad`.""" + if len(self.masks) == 0: + padded_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + padded_masks = np.stack([ + mmcv.impad(mask, shape=out_shape, pad_val=pad_val) + for mask in self.masks + ]) + return BitmapMasks(padded_masks, *out_shape) + + def crop(self, bbox): + """See :func:`BaseInstanceMasks.crop`.""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = np.empty((0, h, w), dtype=np.uint8) + else: + 
cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w] + return BitmapMasks(cropped_masks, h, w) + + def to_tensor(self, dtype): + """See :func:`BaseInstanceMasks.to_tensor`.""" + return ms.Tensor(self.masks, dtype=dtype) diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/__init__.py b/contrib/Overlap-Recovery/train/src/deoccluder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..145b9830252d6567986107442d6c7ca2ff4384ff --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from .deoccluder_r50 import CustomKNet, TrainModelWrapper diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/__init__.py b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5721ba23ef01fdb0ba5fd511ce6401ac0c37f64e --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/__init__.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from .custom_operations import CustomResizeBilinear, normal_init, multi_apply +from .custom_blocks import ConvModule, FFN, MultiheadAttention +from .custom_losses import build_loss +from .custom_samplers import build_sampler +from .custom_assigner import build_assigner diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_assigner.py b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..76879e4abb5621064602d3ae1e5746323b216163 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_assigner.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None +import numpy as np +import mindspore as ms +from mindspore import nn, ops +from .custom_match_cost import build_match_cost +from .custom_operations import NiceRepr + + +class AssignResult(NiceRepr): + """Stores assignments between predicted and truth boxes. Code inherited from mmdetection. + + Attributes: + num_gts (int): the number of truth boxes considered when computing this + assignment + + gt_inds (LongTensor): for each predicted box indicates the 1-based + index of the assigned truth box. 0 means unassigned and -1 means + ignore. + + max_overlaps (FloatTensor): the iou between the predicted box and its + assigned truth box. + + labels (None | LongTensor): If specified, for each predicted box + indicates the category label of the assigned truth box. 
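+
+    Example (hypothetical values, assuming three predictions matched against
+    two ground-truth boxes):
+
+        >>> import mindspore as ms
+        >>> # pred 0 -> gt 0, pred 1 -> background, pred 2 -> gt 1
+        >>> gt_inds = ms.Tensor([1, 0, 2], dtype=ms.int64)
+        >>> labels = ms.Tensor([0, -1, 0], dtype=ms.int64)
+        >>> result = AssignResult(num_gts=2, gt_inds=gt_inds,
+        ...                       max_overlaps=None, labels=labels)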
+ """ + + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + # Interface for possible user-defined properties + self._extra_properties = {} + + def __nice__(self): + """str: a "nice" summary string describing this assign result""" + parts = [] + parts.append(f'num_gts={self.num_gts!r}') + if self.gt_inds is None: + parts.append(f'gt_inds={self.gt_inds!r}') + else: + parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}') + if self.max_overlaps is None: + parts.append(f'max_overlaps={self.max_overlaps!r}') + else: + parts.append('max_overlaps.shape=' + f'{tuple(self.max_overlaps.shape)!r}') + if self.labels is None: + parts.append(f'labels={self.labels!r}') + else: + parts.append(f'labels.shape={tuple(self.labels.shape)!r}') + return ', '.join(parts) + + @property + def num_preds(self): + """int: the number of predictions in this assignment""" + return len(self.gt_inds) + + @property + def info(self): + """dict: a dictionary of info about the object""" + basic_info = { + 'num_gts': self.num_gts, + 'num_preds': self.num_preds, + 'gt_inds': self.gt_inds, + 'max_overlaps': self.max_overlaps, + 'labels': self.labels, + } + basic_info.update(self._extra_properties) + return basic_info + + def set_extra_property(self, key, value): + """Set user-defined new property.""" + assert key not in self.info + self._extra_properties[key] = value + + def get_extra_property(self, key): + """Get user-defined property.""" + return self._extra_properties.get(key, None) + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. + + Args: + gt_labels (torch.Tensor): Labels of gt boxes + """ + self_inds = ms.Tensor(np.arange( + 1, len(gt_labels) + 1), dtype=ms.int32) + self.gt_inds = ops.concat([self_inds, self.gt_inds]) + + self.max_overlaps = ops.concat( + [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps]) + + if self.labels is not None: + self.labels = ops.concat([gt_labels, self.labels]) + + +class MaskHungarianAssigner(nn.Cell): + """Computes one-to-one matching between predictions and ground truth.""" + + def __init__(self, + boundary_cost=None, + topk=1, + **kwargs): + super(MaskHungarianAssigner, self).__init__() + cls_cost = kwargs.get('cls_cost', dict(type='ClassificationCost', weight=1.)) + mask_cost = kwargs.get('mask_cost', dict(type='SigmoidCost', weight=1.0)) + dice_cost = kwargs.get('dice_cost', dict()) + self.cls_cost = build_match_cost(cls_cost) + self.mask_cost = build_match_cost(mask_cost) + self.dice_cost = build_match_cost(dice_cost) + if boundary_cost is not None: + self.boundary_cost = build_match_cost(boundary_cost) + else: + self.boundary_cost = None + self.topk = topk + + def assign(self, + bbox_pred, + cls_pred, + gt_bboxes, + gt_labels, + img_meta=None, + gt_bboxes_ignore=None, + eps=1e-7, + **kwargs): + """Computes one-to-one matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. 
assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + img_meta (dict): Meta information for current image. + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gts, num_bboxes = gt_bboxes.shape[0], bbox_pred.shape[0] + + # 1. assign -1 by default + assigned_gt_inds = ms.numpy.full((num_bboxes, ), -1, dtype=ms.int64) + assigned_labels = ms.numpy.full((num_bboxes, ), -1, dtype=ms.int64) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + + # 2. compute the weighted costs + # classification and bboxcost. + if self.cls_cost.weight != 0 and cls_pred is not None: + cls_cost = self.cls_cost(cls_pred, gt_labels) + else: + cls_cost = 0 + if self.mask_cost.weight != 0: + reg_cost = self.mask_cost(bbox_pred, gt_bboxes) + else: + reg_cost = 0 + if self.dice_cost.weight != 0: + dice_cost = self.dice_cost(bbox_pred, gt_bboxes) + else: + dice_cost = 0 + if self.boundary_cost is not None and self.boundary_cost.weight != 0: + b_cost = self.boundary_cost(bbox_pred, gt_bboxes) + else: + b_cost = 0 + cost = cls_cost + reg_cost + dice_cost + b_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.asnumpy() + if linear_sum_assignment is None: + raise NotImplementedError('Please run "pip install scipy" to install scipy first.' ) + if self.topk == 1: + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + else: + topk_matched_row_inds = [] + topk_matched_col_inds = [] + for i in range(self.topk): + matched_row_inds, matched_col_inds = linear_sum_assignment( + cost) + topk_matched_row_inds.append(matched_row_inds) + topk_matched_col_inds.append(matched_col_inds) + cost[matched_row_inds] = 1e10 + matched_row_inds = np.concatenate(topk_matched_row_inds) + matched_col_inds = np.concatenate(topk_matched_col_inds) + + matched_row_inds = ms.Tensor.from_numpy(matched_row_inds) + matched_col_inds = ms.Tensor.from_numpy(matched_col_inds) + + # 4. 
assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + + +CUSTOM_ASSIGNER = { + 'MaskHungarianAssigner': MaskHungarianAssigner +} + + +def build_assigner(cfg): + assigner_type = cfg.pop('type') + try: + return CUSTOM_ASSIGNER[assigner_type](**cfg) + except KeyError: + raise KeyError diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_blocks.py b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbc5111d55843a6e0c6cd962a0e1881d99205de --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_blocks.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import warnings +import mindspore as ms +from mindspore import nn, ops +from src.model_utils.configs.config_base import config + + +class ConvModule(nn.Cell): + def __init__(self, in_channels, out_channels, kernel_size=1, **kwargs): + super().__init__() + norm_cfg = kwargs.get('norm_cfg', None) + act_cfg = kwargs.get('act_cfg', None) + if norm_cfg is not None: + bias = False + else: + bias = True + self.conv = nn.Conv2d(in_channels, + out_channels, + kernel_size, + stride=kwargs.get('stride', 1), + pad_mode='pad', + padding=kwargs.get('padding', 0), + group=kwargs.get('groups', 1), + dilation=kwargs.get('dilation', 1), + has_bias=bias) + + self.norm = None + if norm_cfg: + if norm_cfg['type'] == 'BN': + self.norm = nn.BatchNorm2d(out_channels, momentum=0.9) + elif norm_cfg['type'] == 'GN': + self.norm = nn.GroupNorm(norm_cfg['num_groups'], out_channels) + elif norm_cfg['type'] == 'LN': + self.norm = nn.LayerNorm(norm_cfg['normalized_shape']) + else: + raise TypeError('Unknown normalization layer') + + self.act = None + if act_cfg: + if act_cfg['type'] == 'ReLU': + self.act = nn.ReLU() + elif act_cfg['type'] == 'Sigmoid': + self.act = nn.Sigmoid() + else: + raise TypeError('Unknown activation layer') + + def construct(self, x): + out = self.conv(x) + if self.norm is not None: + out = self.norm(out) + if self.act is not None: + out = self.act(out) + return out + + +class FFN(nn.Cell): + """Implements feed-forward networks (FFNs) with identity connection. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. 
Default 0.0. + add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + """ + + def __init__(self, embed_dims=256, feedforward_channels=1024, num_fcs=2, **kwargs): + act_cfg = kwargs.get('act_cfg', None) + dropout_layer = kwargs.get('dropout_layer', None) + super(FFN, self).__init__() + if isinstance(act_cfg, type(None)): + act_cfg = dict(type='ReLU') + assert num_fcs >= 2, 'num_fcs should be no less ' \ + f'than 2. got {num_fcs}.' + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + self.act_cfg = act_cfg + if act_cfg and act_cfg.get('type') == 'ReLU': + self.activate = nn.ReLU() + else: + raise RuntimeError(f"Not support cfg: {act_cfg}") + + layers = [] + in_channels = embed_dims + for _ in range(num_fcs - 1): + layers.append( + nn.SequentialCell( + nn.Dense(in_channels, feedforward_channels), self.activate, + )) + in_channels = feedforward_channels + layers.append(nn.Dense(feedforward_channels, embed_dims)) + self.layers = nn.SequentialCell(*layers) + if dropout_layer: + self.dropout_layer = nn.Dropout() + else: + self.dropout_layer = None + self.add_identity = kwargs.get('add_identity', True) + + def construct(self, x, identity=None): + """Forward function for `FFN`. + + The function would add x to the output tensor if residue is None. + """ + out = self.layers(x) + if not self.add_identity: + if self.dropout_layer is not None: + out = self.dropout_layer(out) + return out + if identity is None: + identity = x + if self.dropout_layer is not None: + out = self.dropout_layer(out) + return identity + out + + +class MultiheadAttention(nn.Cell): + """A wrapper for ``torch.nn.MultiheadAttention``. + + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to True for mindspore. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=0., + init_cfg=None, + batch_first=True, + num_proposals=4, + **kwargs): + super().__init__(init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + batch_size = config.data['samples_per_gpu'] if config.train else 1 + self.attn = nn.transformer.MultiHeadAttention( + batch_size, num_proposals, num_proposals, embed_dims, num_heads, + attention_dropout_rate=attn_drop, **kwargs) + if proj_drop > 0: + self.proj_drop = nn.Dropout(proj_drop) + else: + self.proj_drop = None + self.num_proposals = num_proposals + + if dropout_layer > 0: + self.dropout_layer = nn.Dropout(dropout_layer) + else: + self.dropout_layer = None # nn.Identity() + + def construct(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. 
+ + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. 
+ if self.batch_first: + query = query.transpose((1, 0, 2)) + key = key.transpose((1, 0, 2)) + value = value.transpose((1, 0, 2)) + batch_size, num_sample, _ = query.shape + else: + num_sample, batch_size, _ = query.shape + out = self.attn( + query_tensor=query, + key_tensor=key, + value_tensor=value, + attention_mask=ops.ones((batch_size, num_sample, num_sample), ms.float32))[0] + + if self.batch_first: + out = out.transpose((1, 0, 2)) + + if self.proj_drop is not None: + out = self.proj_drop(out) + + if self.dropout_layer is not None: + out = self.dropout_layer(out) + return identity + out + diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_losses.py b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_losses.py new file mode 100644 index 0000000000000000000000000000000000000000..8f6f2ff2676616ddc41ee487c8b4538b1d43735b --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_losses.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import mindspore as ms +import numpy as np +from mindspore import nn, ops + + +class BinaryCrossEntropy(nn.Cell): + def __init__(self, loss_weight=1, reduction='mean', use_sigmoid=True): + super(BinaryCrossEntropy, self).__init__() + self.bce_loss = ops.binary_cross_entropy_with_logits + self.reduction = reduction + self.loss_weight = loss_weight + self.use_sigmoid = use_sigmoid + assert self.use_sigmoid + + def construct(self, pred, label): + return self.loss_weight * ops.binary_cross_entropy_with_logits( + pred, label, ops.ones(pred.shape, ms.float32), + ops.ones(pred.shape, ms.float32), reduction=self.reduction) + + +class FocalLoss(nn.Cell): + def __init__(self, gamma=2.0, loss_weight=1, reduction='sum', use_sigmoid=True): + super(FocalLoss, self).__init__() + self.focal_loss = nn.FocalLoss(gamma=gamma, reduction=reduction) + self.loss_weight = loss_weight + self.use_sigmoid = use_sigmoid + + def construct(self, pred, label, avg_factor): + return self.loss_weight * self.focal_loss(pred, label) / avg_factor + + +class SigmoidFocalClassificationLoss(nn.Cell): + """" + Sigmoid focal-loss for classification. + + Args: + gamma (float): Hyper-parameter to balance the easy and hard examples. Default: 2.0 + alpha (float): Hyper-parameter to balance the positive and negative example. Default: 0.25 + + Returns: + Tensor, the focal loss. 
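+
+    Example (illustrative shapes; the returned loss is element-wise and is not
+    reduced by this cell):
+
+        >>> import mindspore as ms
+        >>> loss_fn = SigmoidFocalClassificationLoss(gamma=2.0, alpha=0.25)
+        >>> logits = ms.numpy.zeros((8, 2), ms.float32)   # (num_preds, num_classes)
+        >>> labels = ms.numpy.zeros((8,), ms.int32)       # class indices
+        >>> loss = loss_fn(logits, labels)                # shape (8, 2)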
+ """ + def __init__(self, gamma=2.0, alpha=0.25, loss_weight=1.0, use_sigmoid=True): + super(SigmoidFocalClassificationLoss, self).__init__() + self.sigmiod_cross_entropy = ops.SigmoidCrossEntropyWithLogits() + self.sigmoid = ops.Sigmoid() + self.pow = ops.Pow() + self.onehot = ops.OneHot() + self.on_value = ms.Tensor(1.0, ms.float32) + self.off_value = ms.Tensor(0.0, ms.float32) + self.gamma = gamma + self.alpha = alpha + self.loss_weight = loss_weight + self.use_sigmoid = use_sigmoid + + def construct(self, logits, label): + label = self.onehot(label, ops.shape(logits)[-1], self.on_value, self.off_value) + sigmiod_cross_entropy = self.sigmiod_cross_entropy(logits, label) + sigmoid = self.sigmoid(logits) + label = ops.cast(label, ms.float32) + p_t = label * sigmoid + (1 - label) * (1 - sigmoid) + modulating_factor = self.pow(1 - p_t, self.gamma) + alpha_weight_factor = label * self.alpha + (1 - label) * (1 - self.alpha) + focal_loss = modulating_factor * alpha_weight_factor * sigmiod_cross_entropy + return self.loss_weight * focal_loss + + +class DiceLoss(nn.Cell): + def __init__(self, loss_weight=1, use_sigmoid=True): + super(DiceLoss, self).__init__() + self.dice_loss = nn.DiceLoss() + self.loss_weight = loss_weight + self.use_sigmoid = use_sigmoid + assert self.use_sigmoid + self.sigmoid = ops.Sigmoid() + + def construct(self, pred, label): + return self.loss_weight * self.dice_loss(self.sigmoid(pred), label) + + +class CLSBCELoss(nn.Cell): + def __init__(self, loss_weight=1, use_sigmoid=True, reduction='mean'): + super(CLSBCELoss, self).__init__() + self.bce_loss = nn.CrossEntropyLoss(reduction=reduction) + self.loss_weight = loss_weight + self.use_sigmoid = use_sigmoid + self.sigmoid = ops.Sigmoid() + + def construct(self, pred, label): + return self.loss_weight * self.bce_loss(pred, label) + + +CUSTOM_LOSSES = { + 'BinaryCrossEntropy': BinaryCrossEntropy, + 'FocalLoss': FocalLoss, + 'DiceLoss': DiceLoss, + 'CLSBCELoss': CLSBCELoss, + 'SigmoidFocalClassificationLoss': SigmoidFocalClassificationLoss +} + + +def build_loss(loss_cfg: dict): + loss_type = loss_cfg.pop('type') + try: + return CUSTOM_LOSSES[loss_type](**loss_cfg) + except KeyError: + raise KeyError diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_match_cost.py b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_match_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..16ddc260460deb56de88d77baa414dc72e5a398a --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_match_cost.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import mindspore as ms +from mindspore import nn, ops +from .custom_operations import custom_ein + + +class FocalLossCost: + """FocalLossCost. 
+ + Args: + weight (int | float, optional): loss_weight + alpha (int | float, optional): focal_loss alpha + gamma (int | float, optional): focal_loss gamma + eps (float, optional): default 1e-12 + binary_input (bool, optional): Whether the input is binary, + default False. + + Examples: + from mmdet.core.bbox.match_costs.match_cost import FocalLossCost + import torch + self = FocalLossCost() + cls_pred = torch.rand(4, 3) + gt_labels = torch.tensor([0, 1, 2]) + factor = torch.tensor([10, 8, 10, 8]) + self(cls_pred, gt_labels) + tensor([[-0.3236, -0.3364, -0.2699], + [-0.3439, -0.3209, -0.4807], + [-0.4099, -0.3795, -0.2929], + [-0.1950, -0.1207, -0.2626]]) + """ + + def __init__(self, weight=1., alpha=0.25, gamma=2, **kwargs): + self.weight = weight + self.alpha = alpha + self.gamma = gamma + self.eps = kwargs.get('eps', 1e-12) + self.binary_input = kwargs.get('binary_input', False) + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits. + gt_labels (Tensor)): Labels. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + if self.binary_input: + return self._mask_focal_loss_cost(cls_pred, gt_labels) + else: + return self._focal_loss_cost(cls_pred, gt_labels) + + def _focal_loss_cost(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_query, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + gt_numpy = gt_labels.asnumpy() + cls_cost = ms.Tensor(pos_cost.asnumpy()[:, gt_numpy]) - ms.Tensor(neg_cost.asnumpy()[:, gt_numpy]) + return cls_cost * self.weight + + def _mask_focal_loss_cost(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits + in shape (num_query, d1, ..., dn), dtype=torch.float32. + gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), + dtype=torch.long. Labels should be binary. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).astype(ms.float32) + n = cls_pred.shape[1] + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + einsum = ops.Einsum('nc,mc->nm') + cls_cost = einsum((pos_cost, gt_labels)) + einsum((neg_cost, (1 - gt_labels))) + return cls_cost / n * self.weight + + +class DiceCost(object): + """DiceCost. 
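+
+    Used as a mask-matching cost for the Hungarian assigner: it returns the
+    negated Dice similarity between predicted and ground-truth masks, so a
+    lower cost means better mask agreement.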
+ + Args: + weight (int | float, optional): loss_weight + pred_act (bool): Whether to activate the prediction + before calculating cost + + Examples: + from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost + import torch + self = BBoxL1Cost() + bbox_pred = torch.rand(1, 4) + gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + factor = torch.tensor([10, 8, 10, 8]) + self(bbox_pred, gt_bboxes, factor) + tensor([[1.6172, 1.6422]]) + """ + + def __init__(self, + weight=1., + pred_act=False, + act_mode='sigmoid', + eps=1e-3): + self.weight = weight + self.pred_act = pred_act + self.act_mode = act_mode + self.eps = eps + + def __call__(self, mask_preds, gt_masks): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + + Returns: + torch.Tensor: bbox_cost value with weight + """ + if self.pred_act and self.act_mode == 'sigmoid': + mask_preds = mask_preds.sigmoid() + elif self.pred_act: + mask_preds = mask_preds.softmax(dim=0) + dice_cost = self.custom_dc_loss(mask_preds, gt_masks, self.eps) + return dice_cost * self.weight + + def custom_dc_loss(cls, input_x, target, eps=1e-3): + input_x = input_x.reshape(input_x.shape[0], -1) + target = target.reshape(target.shape[0], -1).astype(ms.float32) + # einsum saves 10x memory + a = custom_ein('nh,mh->nm', input_x, target) + b = ops.reduce_sum(input_x * input_x, 1) + eps + c = ops.reduce_sum(target * target, 1) + eps + d = (2 * a) / (b[:, None] + c[None, ...]) + # 1 is a constance that will not affect the matching, so ommitted + return -d + + +class MaskCost(object): + """MaskCost. + + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'): + self.weight = weight + self.pred_act = pred_act + self.act_mode = act_mode + + def __call__(self, cls_pred, target): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 
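+
+        Note:
+            ``target`` is a stack of binary ground-truth masks of shape
+            (num_gt, H, W); the returned cost is ``-(pos + neg) / (H * W)``, the
+            negative pixel-wise agreement between every prediction / target pair,
+            scaled by ``weight``.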
+ + Returns: + torch.Tensor: cls_cost value with weight + """ + if self.pred_act and self.act_mode == 'sigmoid': + cls_pred = cls_pred.sigmoid() + elif self.pred_act: + cls_pred = cls_pred.softmax(dim=0) + + _, height, width = target.shape + # eingum is ~10 times faster than matmul + pos_cost = custom_ein('nhw,mhw->nm', cls_pred, target) + neg_cost = custom_ein('nhw,mhw->nm', 1 - cls_pred, 1 - target) + cls_cost = -(pos_cost + neg_cost) / (height * width) + return cls_cost * self.weight + + +CUSTOM_MATCH_COST = { + 'FocalLossCost': FocalLossCost, + 'DiceCost': DiceCost, + 'MaskCost': MaskCost +} + + +def build_match_cost(cfg): + cost_type = cfg.pop('type') + try: + return CUSTOM_MATCH_COST[cost_type](**cfg) + except KeyError: + raise KeyError diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_operations.py b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_operations.py new file mode 100644 index 0000000000000000000000000000000000000000..3f56ae35ec350d639bfa4ef761a633355aa65c27 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_operations.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from functools import partial +import warnings +import numpy as np +import mindspore as ms +import mindspore.nn as nn +from mindspore.common import initializer as init + + +class NiceRepr: + """Inherit from this class and define ``__nice__`` to "nicely" print your + objects. + + Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function + Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``. + If the inheriting class has a ``__len__``, method then the default + ``__nice__`` method will return its length. + + Code inherited from mmdetection. 
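+
+    Example (hypothetical subclass, for illustration only):
+        class Group(NiceRepr):
+            def __init__(self, data):
+                self.data = data
+            def __len__(self):
+                return len(self.data)
+        str(Group([1, 2, 3]))  # -> '<Group(3)>'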
+ """ + + def __nice__(self): + """str: a "nice" summary string describing this module""" + if hasattr(self, '__len__'): + # It is a common pattern for objects to use __len__ in __nice__ + # As a convenience we define a default __nice__ for these objects + return str(len(self)) + else: + # In all other cases force the subclass to overload __nice__ + raise NotImplementedError( + f'Define the __nice__ method for {self.__class__!r}') + + def __repr__(self): + """str: the string of the module""" + try: + nice = self.__nice__() + classname = self.__class__.__name__ + return f'<{classname}({nice}) at {hex(id(self))}>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) + + def __str__(self): + """str: the string of the module""" + try: + classname = self.__class__.__name__ + nice = self.__nice__() + return f'<{classname}({nice})>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) + + +class CustomResizeBilinear(nn.ResizeBilinear): + def __init__(self, size=None, scale_factor=None, align_corners=False, **kwargs): + super(CustomResizeBilinear, self).__init__(**kwargs) + self.size = size + self.scale_factor = scale_factor + self.align_corners = align_corners + + def construct(self, x, **kwargs): + return super(CustomResizeBilinear, self).construct( + x, self.size, self.scale_factor, self.align_corners) + + +def normal_init(cell: nn.Cell, + init_gain: float = 0.02, + mean: float = 0, + bias: float = 0) -> None: + if hasattr(cell, 'weight') and cell.weight is not None: + cell.weight.set_data(init.initializer( + init.Normal(init_gain, mean), cell.weight.shape)) + if hasattr(cell, 'bias') and cell.bias is not None: + cell.bias.set_data(init.initializer(bias, cell.bias.shape)) + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. + + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains \ + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def custom_ein(def_format, x, y): + return ms.Tensor(np.einsum(def_format, x.asnumpy(), y.asnumpy()), dtype=x.dtype) diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_samplers.py b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..b29fbc618cc383a600d0f2ac386a72dae88f41e3 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/custom_cells/custom_samplers.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np +import mindspore as ms +from mindspore import ops, nn +from .custom_operations import NiceRepr + + +class MaskSamplingResult(NiceRepr): + """Bbox sampling result. + + Example: + self = + """ + + def __init__(self, pos_inds, neg_inds, masks, **kwargs): + gt_masks = kwargs.get('gt_masks', None) + assign_result = kwargs.get('assign_result', None) + gt_flags = kwargs.get('gt_flags', None) + self.pos_inds = pos_inds + self.neg_inds = neg_inds + if pos_inds.shape[0] == 0: + height, width = masks.shape[-2:] + self.pos_masks = np.zeros((0, height, width)) + else: + self.pos_masks = masks[pos_inds] + if neg_inds.shape[0] == 0: + height, width = masks.shape[-2:] + self.neg_masks = np.zeros((0, height, width)) + else: + self.neg_masks = masks[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_masks.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + size = ops.Size() + if size(gt_masks) == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_masks = ms.numpy.empty_like(gt_masks) + else: + self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds] + + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + def __nice__(self): + data = self.info.copy() + data['pos_masks'] = data.pop('pos_masks').shape + data['neg_masks'] = data.pop('neg_masks').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def masks(self): + """torch.Tensor: concatenated positive and negative boxes""" + return ops.concat([self.pos_masks, self.neg_masks]) + + @property + def bboxes(self): + """torch.Tensor: concatenated positive and negative boxes""" + return ops.concat([self.pos_bboxes, self.neg_bboxes]) + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_masks': self.pos_masks, + 'neg_masks': self.neg_masks, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + } + + +class MaskPseudoSampler(nn.Cell): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + super(MaskPseudoSampler, self).__init__() + + def sample(self, assign_result, masks, gt_masks, **kwargs): + """Directly returns the positive and negative indices of samples. 
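+
+        No actual subsampling is performed: every proposal whose
+        ``assign_result.gt_inds`` entry is positive becomes a positive sample and
+        every entry equal to zero becomes a negative sample.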
+ + Args: + assign_result (:obj:`AssignResult`): Assigned results + masks (torch.Tensor): Bounding boxes + gt_masks (torch.Tensor): Ground truth boxes + + Returns: + :obj:`SamplingResult`: sampler results + """ + inds_numpy = assign_result.gt_inds.asnumpy() + pos_inds = ms.Tensor(np.unique(np.nonzero(inds_numpy > 0)[0])) + neg_inds = ms.Tensor(np.unique(np.nonzero(inds_numpy == 0)[0])) + + zeros = ops.Zeros() + gt_flags = zeros((masks.shape[0], ), ms.uint8) + + sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks, + gt_masks=gt_masks, + assign_result=assign_result, + gt_flags=gt_flags) + return sampling_result + + +CUSTOM_SAMPLER = { + 'MaskPseudoSampler': MaskPseudoSampler +} + + +def build_sampler(cfg: dict): + sampler_type = cfg.pop('type') + try: + return CUSTOM_SAMPLER[sampler_type](**cfg) + except KeyError: + raise KeyError diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/deoccluder_r50.py b/contrib/Overlap-Recovery/train/src/deoccluder/deoccluder_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..43e6aa0757418dfc8fa747477b6bf0645d7f8ab5 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/deoccluder_r50.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
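+
+"""Main model for de-occlusion: CustomKNet wires the ResNet-50 backbone, FPN neck,
+kernel RPN head and iterative kernel-update ROI head together; TrainModelWrapper
+adds the Adam optimizer and optional distributed gradient reduction.
+
+Minimal usage sketch (assuming a config dict such as the one built in
+src/model_utils/configs):
+
+    net = CustomKNet(config)
+    net.load_r50(ckpt_path)              # load converted ResNet-50 backbone weights
+    train_step = TrainModelWrapper(net)  # one call runs loss + backward + Adam update
+"""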
+ + +import mindspore as ms +from mindspore import nn, ops +from mindspore import load_checkpoint, load_param_into_net +from mindspore.nn.optim import Adam +from mindspore.context import ParallelMode +from mindspore.parallel._auto_parallel_context import auto_parallel_context +from mindspore.communication.management import get_group_size +from src.model_utils.configs.config_base import Config +from ..dataset.utils import BitmapMasks +from .resnet import resnet50 +from .fpn_neck import FeatPyramidNeck +from .rpn.kernel_head import ConvKernelHead +from .roi.custom_kernel_iter_head import CustomKernelIterHead +from .utils import sem2ins_masks + + +class CustomKNet(nn.Cell): + def __init__(self, config): + super(CustomKNet, self).__init__() + self.config = Config(config) + self.mask_assign_stride = self.config.mask_assign_stride + # build backbone (resnet-50) + self.backbone = resnet50(pretrained=False) + # build FPN + self.neck = FeatPyramidNeck(**self.config.neck, + feature_shapes=self.config.feature_shapes) + # build RPN head + self.rpn_head = ConvKernelHead(**self.config.rpn_head) + + # build ROI head + self.roi_head = CustomKernelIterHead(**self.config.roi_head) + + self.interpolate = nn.ResizeBilinear() + + self.is_model_export = False + + self.reduce_sum = ops.ReduceSum() + + def load_r50(self, ckpt_path, prefix='backbone'): + param_dict = load_checkpoint(ckpt_path) + if prefix: + prefix_param_dict = dict() + for key, val in param_dict.items(): + prefix_param_dict[f"{prefix}.{key}"] = val + param_dict = prefix_param_dict + load_param_into_net(self.backbone, param_dict) + + def extract_feat(self, img): + """Directly extract features from the backbone+neck.""" + x = self.backbone(img) + x = self.neck(x) + return x + + def forward_train(self, img, img_metas, gt_bboxes=None, gt_labels=None, + gt_bboxes_ignore=None, gt_masks=None, gt_semantic_seg=None): + assert gt_masks is not None + + # gt_masks and gt_semantic_seg are not padded when forming batch + gt_masks_tensor = [] + gt_sem_seg = [] + gt_sem_cls = [] + # batch_input_shape shoud be the same across images + pad_h, pad_w = img_metas[0]['batch_input_shape'] + assign_height = pad_h // self.mask_assign_stride + assign_width = pad_w // self.mask_assign_stride + + for i, gt_mask in enumerate(gt_masks): + mask_tensor = gt_mask.to_tensor(ms.float32) + if gt_mask.width != pad_w or gt_mask.height != pad_h: + pad_wh = ((0, 0), (0, pad_h - gt_mask.height), (0, pad_w - gt_mask.width)) + pad_op = nn.Pad(paddings=pad_wh) + mask_tensor = pad_op(mask_tensor) + + if gt_semantic_seg is not None: + # gt_semantic seg is padded by 255 and + # zero indicating the first class + sem_labels, sem_seg = sem2ins_masks( + gt_semantic_seg[i], + num_thing_classes=self.num_thing_classes) + if sem_seg.shape[0] == 0: + gt_sem_seg.append( + mask_tensor.new_zeros( + (mask_tensor.shape[0], assign_height, assign_width))) + else: + gt_sem_seg.append( + self.interpolate( + sem_seg[None], (assign_height, assign_width), + align_corners=False)[0]) + gt_sem_cls.append(sem_labels) + + else: + gt_sem_seg = None + gt_sem_cls = None + if mask_tensor.shape[0] == 0: + gt_masks_tensor.append( + mask_tensor.new_zeros( + (mask_tensor.shape[0], assign_height, assign_width))) + else: + gt_masks_tensor.append( + self.interpolate( + mask_tensor[None], (assign_height, assign_width), + align_corners=False)[0]) + gt_masks = gt_masks_tensor + x = self.extract_feat(img) + rpn_results = self.rpn_head.forward_train(x, gt_masks, + gt_labels, + img_metas=img_metas, + gt_sem_seg=gt_sem_seg, + 
gt_sem_cls=gt_sem_cls) + + (rpn_losses, proposal_feats, x_feats, mask_preds, + cls_scores) = rpn_results + losses = self.roi_head.forward_train( + x_feats, + proposal_feats, + mask_preds, + gt_bboxes_ignore=gt_bboxes_ignore, + gt_bboxes=gt_bboxes, + gt_sem_seg=gt_sem_seg, + gt_sem_cls=gt_sem_cls, + imgs_whwh=None, + img_metas=img_metas, + gt_masks=gt_masks, + gt_labels=gt_labels, + cls_score=cls_scores + ) + + losses.update(rpn_losses) + total_loss = None + for key, val in losses.items(): + if isinstance(total_loss, ms.Tensor): + total_loss += val + else: + total_loss = val + return total_loss + + def simple_test(self, img, img_metas, rescale=False): + x = self.extract_feat(img) + rpn_results = self.rpn_head.simple_test_rpn(x, img_metas) + (proposal_feats, x_feats, mask_preds, cls_scores, + seg_preds) = rpn_results + segm_results = self.roi_head.simple_test( + x_feats, + proposal_feats, + mask_preds, + cls_scores, + img_metas, + imgs_whwh=None, + rescale=rescale) + list_segm_results = [] + for segm in segm_results: + list_segm_results.append(list(segm)) + return list_segm_results + + def construct(self, img, gt_bboxes=None, gt_label=None, gt_masks=None, ori_shape=None, img_shape=None, + pad_shape=None, scale_factor=None, flip=None, flip_direction=None): + if self.training: + # pack inputs + img_metas = [] + h, w = img.shape[-2:] + batch_input_shape = (h, w) + gt_bboxes_list = [] + gt_label_list = [] + gt_masks_list = [] + for idx in range(img.shape[0]): + img_meta = { + 'ori_shape': ori_shape[idx], + 'img_shape': img_shape[idx], + 'pad_shape': pad_shape[idx], + 'scale_factor': scale_factor[idx], + 'flip': flip[idx], + 'flip_direction': flip_direction[idx], + 'batch_input_shape': batch_input_shape + } + img_metas.append(img_meta) + num_ins = (gt_label[idx] == 0).sum().astype(ms.int64) + gt_bboxes_list.append(gt_bboxes[idx, :num_ins]) + gt_label_list.append(gt_label[idx, :num_ins]) + gt_masks_list.append(BitmapMasks(gt_masks[idx, :num_ins], h, w)) + return self.forward_train(img, img_metas, + gt_bboxes=gt_bboxes_list, + gt_labels=gt_label_list, + gt_masks=gt_masks_list) + else: + if self.is_model_export: + return self.model_export(img) + else: + # pack inputs + img_metas = [] + h, w = img.shape[-2:] + batch_input_shape = (h, w) + for idx in range(img.shape[0]): + img_meta = { + 'ori_shape': ori_shape[idx], + 'img_shape': img_shape[idx], + 'pad_shape': pad_shape[idx], + 'scale_factor': scale_factor[idx], + 'batch_input_shape': batch_input_shape + } + img_metas.append(img_meta) + return self.simple_test(img, img_metas, True) + + def model_export(self, img): + # pack fake inputs + img_metas = [] + h, w = img.shape[-2:] + batch_input_shape = (h, w) + for idx in range(img.shape[0]): + img_meta = { + 'ori_shape': img.shape[1:], + 'img_shape': img.shape[1:], + 'pad_shape': img.shape[1:], + 'scale_factor': [1, 1], + 'batch_input_shape': batch_input_shape + } + img_metas.append(img_meta) + + x = self.extract_feat(img) + proposal_feats, x_feats, mask_preds, cls_scores, seg_preds = self.rpn_head.onnx_export(x) + + scaled_mask_preds, cls_score = self.roi_head.onnx_export(x_feats, + proposal_feats, + mask_preds, + cls_scores, + img_metas, + ) + return scaled_mask_preds, cls_score + + +class TrainModelWrapper(nn.Cell): + + def __init__(self, network): + super(TrainModelWrapper, self).__init__() + self.network = network + self.network.set_train() + self.trainable_params = network.trainable_params() + self.weights = ms.ParameterTuple(self.trainable_params) + self.optimizer = Adam(self.trainable_params, 
learning_rate=0.0001, eps=1e-8) + self.hyper_map = ops.HyperMap() + self.grad = ops.GradOperation(get_by_list=True) + self.reducer_flag = False + self.grad_reducer = None + self.parallel_mode = ms.get_auto_parallel_context("parallel_mode") + if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: + self.reducer_flag = True + if self.reducer_flag: + mean = ms.get_auto_parallel_context("gradients_mean") + if auto_parallel_context().get_device_num_is_set(): + degree = ms.get_auto_parallel_context("device_num") + else: + degree = get_group_size() + self.grad_reducer = nn.DistributedGradReducer( + self.optimizer.parameters, mean, degree) + + def construct(self, *args, **kwargs): + total_loss = self.network(*args, **kwargs) + grads = self.grad(self.network, self.weights)(*args, **kwargs) + if self.reducer_flag: + grads = self.grad_reducer(grads) + return ops.depend(total_loss, self.optimizer(grads)) diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/fpn_neck.py b/contrib/Overlap-Recovery/train/src/deoccluder/fpn_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..430a0193f913c58c0dc3152e7be97ed6a0b71274 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/fpn_neck.py @@ -0,0 +1,127 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Feature pyramid network. (inherited from MaskRCNN in model zoo)""" + + +import numpy as np +import mindspore.nn as nn +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor +from mindspore.common import dtype as mstype +from mindspore.common.initializer import initializer +from mindspore import context + + +def bias_init_zeros(shape): + """Bias init method.""" + return Tensor(np.array(np.zeros(shape).astype(np.float32)), dtype=mstype.float32) + + +def _conv(in_channels, out_channels, kernel_size=3, **kwargs): + """Conv2D wrapper.""" + stride = kwargs.get('stride', 1) + padding = kwargs.get('padding', 0) + pad_mode = kwargs.get('pad_mode', 'pad') + shape = (out_channels, in_channels, kernel_size, kernel_size) + weights = initializer("XavierUniform", shape=shape, dtype=mstype.float32) + shape_bias = (out_channels,) + biass = bias_init_zeros(shape_bias) + return nn.Conv2d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + pad_mode=pad_mode, weight_init=weights, has_bias=True, bias_init=biass) + + +class FeatPyramidNeck(nn.Cell): + """ + Feature pyramid network cell, usually uses as network neck. + + Applies the convolution on multiple, input feature maps + and output feature map with same channel size. if required num of + output larger then num of inputs, add extra maxpooling for further + downsampling; + + Args: + in_channels (tuple) - Channel size of input feature maps. + out_channels (int) - Channel size output. + num_outs (int) - Num of output features. 
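+        feature_shapes (list) - Spatial sizes of the input feature maps, used to
+            build the fixed-size bilinear upsampling ops of the top-down pathway.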
+ + Returns: + Tuple, with tensors of same channel size. + + Examples: + neck = FeatPyramidNeck([100,200,300], 50, 4, config.feature_shapes) + input_data = (normal(0,0.1,(1,c,1280//(4*2**i), 768//(4*2**i)), + dtype=np.float32) \ + for i, c in enumerate(config.fpn_in_channels)) + x = neck(input_data) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + feature_shapes): + super(FeatPyramidNeck, self).__init__() + + if context.get_context("device_target") == "Ascend": + self.cast_type = mstype.float16 + else: + self.cast_type = mstype.float32 + + self.num_outs = num_outs + self.in_channels = in_channels + self.fpn_layer = len(self.in_channels) + + assert not self.num_outs < len(in_channels) + + self.lateral_convs_list_ = [] + self.fpn_convs_ = [] + + for _, channel in enumerate(in_channels): + l_conv = _conv(channel, out_channels, kernel_size=1, stride=1, + padding=0, pad_mode='valid').to_float(self.cast_type) + fpn_conv = _conv(out_channels, out_channels, kernel_size=3, stride=1, + padding=0, pad_mode='same').to_float(self.cast_type) + self.lateral_convs_list_.append(l_conv) + self.fpn_convs_.append(fpn_conv) + self.lateral_convs_list = nn.layer.CellList(self.lateral_convs_list_) + self.fpn_convs_list = nn.layer.CellList(self.fpn_convs_) + self.interpolate1 = P.ResizeBilinear(feature_shapes[2]) + self.interpolate2 = P.ResizeBilinear(feature_shapes[1]) + self.interpolate3 = P.ResizeBilinear(feature_shapes[0]) + self.cast = P.Cast() + self.maxpool = P.MaxPool(kernel_size=1, strides=2, pad_mode="same") + + def construct(self, inputs): + x = () + for i in range(self.fpn_layer): + x += (self.lateral_convs_list[i](inputs[i]),) + + y = (x[3],) + y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.cast_type),) + y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.cast_type),) + y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.cast_type),) + + z = () + for i in range(self.fpn_layer - 1, -1, -1): + z = z + (y[i],) + + outs = () + for i in range(self.fpn_layer): + outs = outs + (self.fpn_convs_list[i](z[i]),) + + for i in range(self.num_outs - self.fpn_layer): + outs = outs + (self.maxpool(outs[3]),) + return outs diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/resnet.py b/contrib/Overlap-Recovery/train/src/deoccluder/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f43886baaf594b1aa48b8fc92b7711171d85347f --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/resnet.py @@ -0,0 +1,137 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +resnet-50 backbone, code inherited from model zoo. 
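+
+The forward pass returns the multi-scale feature maps (c2, c3, c4, c5) consumed by
+the FPN neck; the classification head of the original ResNet-50 is omitted.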
+""" + + +import mindspore.nn as nn +from mindspore.common import initializer +from mindspore import Parameter +import mindspore +from mindspore import load_checkpoint, load_param_into_net +from src.model_utils.configs.config_base import config + + +class Bottleneck(nn.Cell): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, has_bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, + has_bias=False, pad_mode='pad') + self.bn2 = nn.BatchNorm2d(planes) + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, has_bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def construct(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Cell): + """ + A ResNet-50 model without final fully connected layer + """ + def __init__(self, block, layers): + self.inplanes = 64 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, has_bias=False, pad_mode='pad') + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(kernel_size=7, stride=1) + self.pad = nn.Pad(paddings=((0, 0), (0, 0), (1, 1), (1, 1)), mode='CONSTANT') + + for m in self.cells(): + if isinstance(m, nn.Conv2d): + m.weight = Parameter(initializer.initializer( + init=initializer.HeNormal(mode='fan_out', nonlinearity='relu'), + shape=m.weight.shape, dtype=mindspore.float32), name=m.weight.name) + + def construct(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.pad(x) + x = self.maxpool(x) + + c2 = self.layer1(x) + c3 = self.layer2(c2) + c4 = self.layer3(c3) + c5 = self.layer4(c4) + results = (c2, c3, c4, c5) + return results + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.SequentialCell([ + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, has_bias=False), + nn.BatchNorm2d(planes * block.expansion) + ]) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.SequentialCell(*layers) + + +def resnet50(pretrained=True, **kwargs): + """Constructs a ResNet-50 model. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + param_dict = load_checkpoint(config.pretrained_r50) + load_param_into_net(model, param_dict) + + return model diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/roi/__init__.py b/contrib/Overlap-Recovery/train/src/deoccluder/roi/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4f96c15807d6ba6a00c589cc2181e8677d1c44dd --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/roi/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/roi/custom_kernel_iter_head.py b/contrib/Overlap-Recovery/train/src/deoccluder/roi/custom_kernel_iter_head.py new file mode 100644 index 0000000000000000000000000000000000000000..65f54521f5592cbf0b5defce5804cc7ee3adef84 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/roi/custom_kernel_iter_head.py @@ -0,0 +1,346 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import mindspore as ms +from mindspore import nn, ops +import numpy as np + +from .custom_kernel_update_head import CustomKernelUpdateHead +from ..custom_cells import build_assigner, build_sampler + + +class CustomKernelIterHead(nn.Cell): + + def __init__(self, num_stages=6, proposal_feature_channel=256, mask_head=None, **kwargs): + super(CustomKernelIterHead, self).__init__() + if isinstance(mask_head, type(None)): + mask_head = dict() + num_proposals = kwargs.get('num_proposals', 100) + train_cfg = kwargs.get('train_cfg', None) + test_cfg = kwargs.get('test_cfg', None) + mask_assign_stride = kwargs.get('mask_assign_stride', 4) + stage_loss_weights = kwargs.get('stage_loss_weights', None) + assert len(stage_loss_weights) == num_stages + self.num_stages = num_stages + self.stage_loss_weights = stage_loss_weights + self.proposal_feature_channel = proposal_feature_channel + self.merge_cls_scores = False + self.recursive = False + self.post_assign = False + self.mask_out_stride = 4 + self.hard_target = False + self.assign_stages = 5 + self.num_thing_classes = 80 + self.mask_assign_stride = mask_assign_stride + self.num_proposals = num_proposals + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.interpolate = nn.ResizeBilinear() + if mask_head is not None: + self.init_mask_head(None, mask_head) + self.init_assigner_sampler() + self.init_weights() + + @property + def apply_kernel_occlusion(self): + return self.mask_head[0].apply_kernel_occlusion + + @property + def occ_pair_num(self): + return 2 * self.mask_head[0].pair_num + + def init_weights(self): + for i in range(self.num_stages): + self.mask_head[i].init_weights() + + def init_assigner_sampler(self): + """Initialize assigner and sampler for each stage.""" + self.mask_assigner = [] + self.mask_sampler = [] + if self.train_cfg is not None: + for idx, rcnn_train_cfg in enumerate(self.train_cfg): + 
self.mask_assigner.append( + build_assigner(rcnn_train_cfg['assigner'])) + self.current_stage = idx + self.mask_sampler.append( + build_sampler(rcnn_train_cfg['sampler'])) + + def init_mask_head(self, mask_roi_extractor, mask_head): + """Initialize mask head and mask roi extractor. + + Args: + mask_roi_extractor (dict): Config of mask roi extractor. + mask_head (dict): Config of mask in mask head. + """ + self.mask_head = nn.CellList() + if not isinstance(mask_head, list): + mask_head = [mask_head for _ in range(self.num_stages)] + assert len(mask_head) == self.num_stages + for head in mask_head: + self.mask_head.append(CustomKernelUpdateHead(**head)) + if self.recursive: + for i in range(self.num_stages): + self.mask_head[i] = self.mask_head[0] + + def construct(self, *inputs, **kwargs): + if self.training: + return self.forward_train(*inputs, **kwargs) + else: + return self.simple_test(*inputs, **kwargs) + + def forward_train(self, + x, + proposal_feats, + mask_preds, + **kwargs): + cls_score = kwargs.get('cls_score', None) + imgs_whwh = kwargs.get('imgs_whwh', None) + gt_sem_seg = kwargs.get('gt_sem_seg', None) + gt_sem_cls = kwargs.get('gt_sem_cls', None) + img_metas = kwargs.get('img_metas', None) + gt_masks = kwargs.get('gt_masks', None) + gt_labels = kwargs.get('gt_labels', None) + num_imgs = len(img_metas) + if self.mask_head[0].mask_upsample_stride > 1: + interpolate = nn.ResizeBilinear() + prev_mask_preds = interpolate( + ops.stop_gradient(mask_preds), + scale_factor=self.mask_head[0].mask_upsample_stride, + align_corners=False) + else: + prev_mask_preds = ops.stop_gradient(mask_preds) + + if cls_score is not None: + prev_cls_score = ops.stop_gradient(cls_score) + else: + prev_cls_score = [None] * num_imgs + + if self.hard_target: + gt_masks = [x.bool().astype(ms.float32) for x in gt_masks] + else: + gt_masks = gt_masks + + object_feats = proposal_feats + all_stage_loss = {} + all_stage_mask_results = [] + assign_results = [] + for stage in range(self.num_stages): + mask_results = self._mask_forward(x, object_feats, + mask_preds, + stage=stage, + img_metas=img_metas) + all_stage_mask_results.append(mask_results) + if self.apply_kernel_occlusion: + try: + mask_preds = mask_results['mask_preds'][:, :-self.occ_pair_num] + except KeyError: + raise KeyError + else: + try: + mask_preds = mask_results['mask_preds'] + except KeyError: + raise KeyError + scaled_mask_preds = mask_results.get('scaled_mask_preds', None) + cls_score = mask_results.get('cls_score', None) + object_feats = mask_results.get('object_feats', None) + + if self.post_assign: + if self.apply_kernel_occlusion: + prev_mask_preds = ops.stop_gradient(scaled_mask_preds[:, :-self.occ_pair_num]) + else: + prev_mask_preds = ops.stop_gradient(scaled_mask_preds) + prev_cls_score = ops.stop_gradient(cls_score) + + sampling_results = [] + if stage < self.assign_stages: + assign_results = [] + for i in range(num_imgs): + if stage < self.assign_stages: + mask_for_assign = prev_mask_preds[i][:self.num_proposals] + if prev_cls_score[i] is not None: + cls_for_assign = prev_cls_score[ + i][:self.num_proposals, :self.num_thing_classes] + else: + cls_for_assign = None + assign_result = self.mask_assigner[stage].assign( + mask_for_assign, cls_for_assign, gt_masks[i], + gt_labels[i], img_metas[i]) + assign_results.append(assign_result) + if self.apply_kernel_occlusion: + sampling_result = self.mask_sampler[stage].sample( + assign_results[i], scaled_mask_preds[i, :-self.occ_pair_num], gt_masks[i]) + else: + sampling_result = 
self.mask_sampler[stage].sample( + assign_results[i], scaled_mask_preds[i], gt_masks[i]) + sampling_results.append(sampling_result) + mask_targets = self.mask_head[stage].get_targets( + sampling_results, + gt_masks, + gt_labels, + self.train_cfg[stage], + True, + gt_sem_seg=gt_sem_seg, + gt_sem_cls=gt_sem_cls) + + single_stage_loss = self.mask_head[stage].loss( + object_feats, + cls_score, + scaled_mask_preds, + *mask_targets, + imgs_whwh=imgs_whwh) + for key, value in single_stage_loss.items(): + all_stage_loss[f's{stage}_{key}'] = value * \ + self.stage_loss_weights[stage] + + if not self.post_assign: + if self.apply_kernel_occlusion: + prev_mask_preds = ops.stop_gradient(scaled_mask_preds[:, :-self.occ_pair_num]) + else: + prev_mask_preds = ops.stop_gradient(scaled_mask_preds) + prev_cls_score = ops.stop_gradient(cls_score) + + return all_stage_loss + + def simple_test(self, + x, + proposal_feats, + mask_preds, + cls_score, + img_metas, + **kwargs): + + # Decode initial proposals + num_imgs = len(img_metas) + + object_feats = proposal_feats + scaled_mask_preds = None + for stage in range(self.num_stages): + mask_results = self._mask_forward(x, object_feats, + mask_preds, + stage=stage, + img_metas=img_metas) + object_feats = mask_results.get('object_feats', None) + cls_score = mask_results.get('cls_score', None) + mask_preds = mask_results.get('mask_preds', None) + scaled_mask_preds = mask_results.get('scaled_mask_preds', None) + + num_classes = self.mask_head[-1].num_classes + results = [] + + if self.mask_head[-1].loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + else: + cls_score = cls_score.softmax(-1)[..., :-1] + + for img_id in range(num_imgs): + cls_score_per_img = cls_score[img_id] + scores_per_img, topk_indices = ops.TopK(sorted=True)( + cls_score_per_img.view(-1), self.test_cfg['max_per_img']) + mask_indices = topk_indices // num_classes + labels_per_img = topk_indices % num_classes + masks_per_img = scaled_mask_preds[img_id][mask_indices] + single_result = self.mask_head[-1].get_seg_masks( + masks_per_img, labels_per_img, scores_per_img, + self.test_cfg, img_metas[img_id]) + results.append(single_result) + return results + + def onnx_export(self, + x, + proposal_feats, + mask_preds, + cls_score, + img_metas, + ): + + + # Decode initial proposals + num_imgs = len(img_metas) + + object_feats = proposal_feats + scaled_mask_preds = None + for stage in range(self.num_stages): + cls_score, mask_preds, scaled_mask_preds, object_feats = self._mask_forward_export(stage, x, object_feats, + mask_preds, img_metas) + + cls_score = ms.ops.sigmoid(cls_score) + + # Resale scaled_mask_preds to batched img shape, (B, num_det, H/4, W/4) -> (B, num_det, H, W) + scaled_mask_preds = self.interpolate( + ms.ops.sigmoid(scaled_mask_preds), + size=(768, 768), # hard code + align_corners=False) + + return scaled_mask_preds, cls_score + + def segm2result_onnx(self, mask_preds, det_labels, cls_scores): + + segm_result = [] + seg_scores = [] + + mask_preds = mask_preds.detach() # num_det, h,w + det_labels = det_labels.detach() # class id + cls_scores = cls_scores.detach() + + num_ins = mask_preds.shape[0] # num_dets, h, w + for idx in range(num_ins): + segm_result.append(mask_preds[idx]) + seg_scores.append(cls_scores[idx]) + # here we only have one classes (text) + segm_result = ms.ops.stack(segm_result) # num_det, h, w + seg_scores = ms.ops.stack(seg_scores) # num_det + + return segm_result, seg_scores + + def _mask_forward(self, x, object_feats, mask_preds, **kwargs): + stage = 
kwargs.get('stage', None) + img_metas = kwargs.get('img_metas', None) + mask_head = self.mask_head[stage] + cls_score, mask_preds, object_feats = mask_head( + x, object_feats, mask_preds, img_metas=img_metas) + if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1 + or self.training): + interpolate = nn.ResizeBilinear() + scaled_mask_preds = interpolate( + mask_preds, + scale_factor=mask_head.mask_upsample_stride, + align_corners=False) + else: + scaled_mask_preds = mask_preds + mask_results = dict( + cls_score=cls_score, + mask_preds=mask_preds, + scaled_mask_preds=scaled_mask_preds, + object_feats=object_feats) + + return mask_results + + def _mask_forward_export(self, stage, x, object_feats, mask_preds, img_metas): + mask_upsample_stride = 2 + mask_head = self.mask_head[stage] + cls_score, mask_preds, object_feats = mask_head( + x, object_feats, mask_preds, img_metas=img_metas) + if mask_upsample_stride > 1 and (stage == self.num_stages - 1 + or self.training): + interpolate = nn.ResizeBilinear() + scaled_mask_preds = interpolate( + mask_preds, + scale_factor=mask_upsample_stride, + align_corners=False) + else: + scaled_mask_preds = mask_preds + results = (cls_score, mask_preds, scaled_mask_preds, object_feats) + return results diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/roi/custom_kernel_update_head.py b/contrib/Overlap-Recovery/train/src/deoccluder/roi/custom_kernel_update_head.py new file mode 100644 index 0000000000000000000000000000000000000000..bb5da55dc1d7a8c3ef7b64975af78a347b687525 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/roi/custom_kernel_update_head.py @@ -0,0 +1,307 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
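+
+"""Occlusion-aware kernel update head: extends KernelUpdateHead with paired
+"union" / "interaction" kernels and the corresponding occlusion mask and dice losses."""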
+ + +import numpy as np +import mindspore as ms +from mindspore import nn, ops + +from .kernel_update_head import KernelUpdateHead +from ..custom_cells import build_loss + + +class CustomKernelUpdateHead(KernelUpdateHead): + def __init__(self, apply_kernel_occlusion=False, kernel_occlusion_cfg=None, **kwargs): + super(CustomKernelUpdateHead, self).__init__(**kwargs) + self.apply_kernel_occlusion = apply_kernel_occlusion + if apply_kernel_occlusion: + self.init_kernel_occlusion(kernel_occlusion_cfg) + + def init_kernel_occlusion(self, kernel_occlusion_cfg): + # prepare config + self.num_proposals = kernel_occlusion_cfg.get('num_proposals') + assert self.num_proposals >= 2 + self.pair_list = [] + for ii in range(self.num_proposals - 1): + for jj in range(ii + 1, self.num_proposals): + self.pair_list.append(ii) + self.pair_list.append(jj) + self.pair_num = len(self.pair_list) // 2 + self.pair_manner = kernel_occlusion_cfg.get('pair_manner', 'sum') + print(f"Manner of merging kernel pair: {self.pair_manner}") + assert self.pair_manner in ['sum', 'cat'] + # prepare layer and init weights + if self.pair_manner == 'sum': + self.union_fc = nn.Dense(self.in_channels, self.in_channels) + self.interact_fc = nn.Dense(self.in_channels, self.in_channels) + else: + self.union_fc = nn.Dense(2 * self.in_channels, self.in_channels) + self.interact_fc = nn.Dense(2 * self.in_channels, self.in_channels) + self.apply_occ_union = kernel_occlusion_cfg['u_mask_loss']['loss_weight'] > 0 or \ + kernel_occlusion_cfg['u_dice_loss']['loss_weight'] > 0 + self.occ_union_mask_loss = build_loss(kernel_occlusion_cfg.get('u_mask_loss').copy()) + self.occ_interact_mask_loss = build_loss(kernel_occlusion_cfg.get('i_mask_loss').copy()) + self.occ_union_dice_loss = build_loss(kernel_occlusion_cfg.get('u_dice_loss').copy()) + self.occ_interact_dice_loss = build_loss(kernel_occlusion_cfg.get('i_dice_loss').copy()) + + def kernel_occlusion(self, obj_feat): + """ + Apply Kernel Occlusion operation. 
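+
+        For every unordered pair of the ``num_proposals`` kernels, the pair feature
+        (summed or concatenated according to ``pair_manner``) is mapped by
+        ``union_fc`` and ``interact_fc`` to a "union" kernel and an "interaction"
+        kernel, giving an output of shape (B, 2 * pair_num, K * K, C).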
+ + :param + :obj_feat : Tensor with shape (B, N, K * K, C), where K is convolution kernel size + """ + b, n, _, c = obj_feat.shape + # B, N, K * K, C -> B, N, K * K * C + kernels = obj_feat.reshape(b, n, -1) + assert n == self.num_proposals + # B, pair_num, 2, K * K * C + kernel_pairs = kernels[:, self.pair_list].reshape(b, self.pair_num, 2, -1) + if self.pair_manner == 'sum': + # B, pair_num, K * K * C + kernel_pairs = kernel_pairs.sum(axis=2) + else: + # B, pair_num, 2 * K * K * C + kernel_pairs = kernel_pairs.reshape(b, self.pair_num, -1) + # union and interact kernels + # B, 2 * pair_num, K * K * C -> B, 2 * pair_num, K * K, C + ui_kernels = ops.concat([ + self.union_fc(kernel_pairs), self.interact_fc(kernel_pairs) + ], axis=1).reshape(b, 2 * self.pair_num, -1, c) + + return ui_kernels + + def construct(self, x, proposal_feat, mask_preds, **kwargs): + mask_shape = kwargs.get('mask_shape', None) + n_sample, num_proposals = proposal_feat.shape[:2] + if self.feat_transform is not None: + x = self.feat_transform(x) + chn, height, width = x.shape[-3:] + + mask_h, mask_w = mask_preds.shape[-2:] + if mask_h != height or mask_w != width: + gather_mask = self.interpolate( + mask_preds, size=(height, width), align_corners=False) + else: + gather_mask = mask_preds + + + sigmoid_masks = ms.ops.sigmoid(gather_mask) + nonzero_inds = sigmoid_masks > self.hard_mask_thr + sigmoid_masks = nonzero_inds.astype(ms.float32) + + # einsum is faster than bmm by 30% + b, n, h, w = sigmoid_masks.shape + _, c, _, _ = x.shape + sigmoid_masks = ms.ops.reshape(sigmoid_masks, (b, n, h*w)) + tmp_x_feats = ms.ops.reshape(x, (b, c, h*w)) + tmp_x_feats = ms.ops.transpose(tmp_x_feats, (0, 2, 1)) + x_feat = ms.ops.bmm(sigmoid_masks, tmp_x_feats) + + # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C] + proposal_feat = proposal_feat.reshape(n_sample, num_proposals, + self.in_channels, + -1).transpose(0, 1, 3, 2) + obj_feat = self.kernel_update_conv(x_feat, proposal_feat) + + # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C] + obj_feat = obj_feat.reshape(n_sample, num_proposals, -1).transpose(1, 0, 2) + obj_feat = self.attention_norm(self.attention(obj_feat)) + # [N, B, K*K*C] -> [B, N, K*K*C] + obj_feat = obj_feat.transpose(1, 0, 2) + + # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C] + obj_feat = obj_feat.reshape(n_sample, num_proposals, -1, self.in_channels) + + # FFN + if self.with_ffn: + obj_feat = self.ffn_norm(self.ffn(obj_feat)) + + cls_feat = obj_feat.sum(-2) + mask_feat = obj_feat + + ui_pair_num = None + if self.apply_kernel_occlusion and self.training: + ui_kernels = self.kernel_occlusion(obj_feat) + ui_pair_num = ui_kernels.shape[1] + mask_feat = ops.concat([mask_feat, ui_kernels], axis=1) + + for cls_layer in self.cls_fcs: + cls_feat = cls_layer(cls_feat) + for reg_layer in self.mask_fcs: + mask_feat = reg_layer(mask_feat) + + cls_score = self.fc_cls(cls_feat).view(n_sample, num_proposals, -1) + # [B, N, K*K, C] -> [B, N, C, K*K] + mask_feat = self.fc_mask(mask_feat).transpose(0, 1, 3, 2) + + if (self.mask_transform_stride == 2 and self.feat_gather_stride == 1): + mask_x = self.interpolate( + x, scale_factor=0.5, align_corners=False) + height, width = mask_x.shape[-2:] + else: + mask_x = x + # [B, N, C, K*K] -> [B*N, C, K, K] + if self.apply_kernel_occlusion and self.training: + tmp_num = num_proposals + ui_pair_num + mask_feat = mask_feat.reshape(n_sample, tmp_num, chn, + self.conv_kernel_size, + self.conv_kernel_size) + else: + mask_feat = mask_feat.reshape(n_sample, num_proposals, 
chn, + self.conv_kernel_size, + self.conv_kernel_size) + # [B, C, H, W] -> [1, B*C, H, W] + new_mask_preds = [] + for i in range(n_sample): + new_mask_preds.append( + ops.conv2d( + mask_x[i:i + 1], + mask_feat[i], + padding=int(self.conv_kernel_size // 2))) + + new_mask_preds = ops.concat(new_mask_preds, axis=0) + if self.apply_kernel_occlusion and self.training: + new_mask_preds = new_mask_preds.reshape(n_sample, num_proposals + ui_pair_num, height, width) + else: + new_mask_preds = new_mask_preds.reshape(n_sample, num_proposals, height, width) + if self.mask_transform_stride == 2: + new_mask_preds = self.interpolate( + new_mask_preds, + scale_factor=2, + align_corners=False) + + if mask_shape is not None and mask_shape[0] != height: + new_mask_preds = self.interpolate( + new_mask_preds, + size=mask_shape, + mode='bilinear') + + return cls_score, new_mask_preds, obj_feat.transpose(0, 1, 3, 2).reshape( + n_sample, num_proposals, self.in_channels, self.conv_kernel_size, + self.conv_kernel_size) + + def loss(self, + object_feats, + cls_score, + mask_pred, + labels, + label_weights, + mask_targets, + mask_weights, + imgs_whwh=None, + reduction_override=None, + **kwargs): + if not self.apply_kernel_occlusion: + return super(CustomKernelUpdateHead, self).loss( + object_feats, + cls_score, + mask_pred, + labels, + label_weights, + mask_targets, + mask_weights, + imgs_whwh=imgs_whwh, + reduction_override=reduction_override, + **kwargs + ) + assert mask_pred.shape[1] > 2 * self.pair_num + losses = super(CustomKernelUpdateHead, self).loss( + object_feats, + cls_score, + mask_pred[:, :-2 * self.pair_num], + labels, + label_weights, + mask_targets, + mask_weights, + imgs_whwh=imgs_whwh, + reduction_override=reduction_override, + **kwargs + ) + b, _, h, w = mask_pred.shape + occ_mask_pred = mask_pred[:, -2 * self.pair_num:] + # determine positive indexes + bg_class_ind = self.num_classes + # note in spare rcnn num_gt == num_pos + pos_inds = (labels >= 0).astype(ms.int32) & (labels < bg_class_ind).astype(ms.int32) + pos_inds = pos_inds.reshape(b, -1) + mask_targets = mask_targets.reshape(b, -1, h, w) + # select gt pairs + pred_union_inds = [] + pred_interact_inds = [] + occ_union_targets = [] + occ_interact_targets = [] + for batch_idx in range(b): + num_valid = pos_inds[batch_idx].sum().asnumpy().item() + if num_valid <= 1: + continue + valid_inds = ops.nonzero(pos_inds[batch_idx]).view(-1).asnumpy().tolist() + valid_inds = sorted(valid_inds) + valid_target_pairs = [] + union_pred_pair = [] + iteract_pred_pair = [] + for ii in range(num_valid - 1): + for jj in range(ii + 1, num_valid): + valid_target_pairs.append(ii) + valid_target_pairs.append(jj) + # get corresponding index in pair list + a, b = valid_inds[ii], valid_inds[jj] + idx_in_pair = (self.num_proposals - 1 + self.num_proposals - a) * a // 2 + b - a - 1 + union_pred_pair.append([batch_idx, idx_in_pair]) + iteract_pred_pair.append([batch_idx, idx_in_pair + self.pair_num]) + # check if this code contain bug + candidate_pair_list = np.array(self.pair_list).reshape(-1, 2) + assert candidate_pair_list[idx_in_pair][0] == a and candidate_pair_list[idx_in_pair][1] == b + # union_of_img1, interact_of_img1, union_of_img2, ... 
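+            # Accumulate, for this image, the (batch, channel) indices of the
+            # predicted union and interaction masks of every valid ground-truth pair;
+            # idx_in_pair above is the position of the pair (a, b) in self.pair_list:
+            # row a of the upper triangle starts at (2 * num_proposals - a - 1) * a // 2
+            # and (b - a - 1) is the offset inside that row.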
+ pred_union_inds += union_pred_pair + pred_interact_inds += iteract_pred_pair + # prepare gt + # num_pair, 2, h, w -> we apply hard target for occlusion target + mask_target = mask_targets[batch_idx, ms.Tensor(np.nonzero(pos_inds[batch_idx].asnumpy())[0])][ + valid_target_pairs].reshape(-1, 2, h, w).astype(ms.bool_) + # 2 * num_pair, h, w + # union_wo_interaction, interaction area + union_area = mask_target[:, 0].astype(ms.int32) | mask_target[:, 1].astype(ms.int32) + interaction_area = mask_target[:, 0].astype(ms.int32) & mask_target[:, 1].astype(ms.int32) + # union without interaction area + occ_union_targets.append(union_area) + occ_interact_targets.append(interaction_area) + if len(occ_interact_targets) == 0: + losses.update(loss_occ_mask=occ_mask_pred.sum() * 0, + loss_occ_dice=occ_mask_pred.sum() * 0) + return losses + # select prediction + occ_union_targets = ops.concat(occ_union_targets, axis=0).astype(ms.float32) + occ_interact_targets = ops.concat(occ_interact_targets, axis=0).astype(ms.float32) + occ_union_preds = occ_mask_pred[[x[0] for x in pred_union_inds], + [x[1] for x in pred_union_inds]] + occ_interact_preds = occ_mask_pred[[x[0] for x in pred_interact_inds], + [x[1] for x in pred_interact_inds]] + if self.apply_occ_union: + loss_occ_union_mask = self.occ_union_mask_loss(occ_union_preds, occ_union_targets) + loss_occ_union_dice = self.occ_union_dice_loss(occ_union_preds, occ_union_targets) + loss_occ_interact_mask = self.occ_interact_mask_loss(occ_interact_preds, occ_interact_targets) + loss_occ_interact_dice = self.occ_interact_dice_loss(occ_interact_preds, occ_interact_targets) + losses.update( + loss_occ_mask=0.5*(loss_occ_union_mask+loss_occ_interact_mask), + loss_occ_dice=0.5*(loss_occ_union_dice+loss_occ_interact_dice) + ) + else: + losses.update( + loss_occ_mask=self.occ_interact_mask_loss(occ_interact_preds, occ_interact_targets), + loss_occ_dice=self.occ_interact_dice_loss(occ_interact_preds, occ_interact_targets) + ) + + return losses diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/roi/kernel_update_head.py b/contrib/Overlap-Recovery/train/src/deoccluder/roi/kernel_update_head.py new file mode 100644 index 0000000000000000000000000000000000000000..138dd421b64f86d6f8820a968236be1456acb45a --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/roi/kernel_update_head.py @@ -0,0 +1,343 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
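+
+"""Base kernel update head: holds the classification / mask / dice losses (built
+from the custom loss registry), target assignment and mask post-processing shared
+by the occlusion-aware head."""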
+ + +import numpy as np + +import mindspore as ms +from mindspore import nn, ops +from mindspore.common import initializer as init +from mindspore.communication.management import GlobalComm, get_group_size + +from ..custom_cells import (build_loss, multi_apply, ConvModule, FFN, MultiheadAttention) +from .kernel_updator import KernelUpdator + + +class KernelUpdateHead(nn.Cell): + + def __init__(self, loss_dice=None, loss_cls=None, num_proposals=4, **kwargs): + super(KernelUpdateHead, self).__init__() + # load arguments + num_classes = kwargs.get('num_classes', 80) + num_ffn_fcs = kwargs.get('num_ffn_fcs', 2) + num_heads = kwargs.get('num_heads', 8) + num_cls_fcs = kwargs.get('num_cls_fcs', 1) + num_mask_fcs = kwargs.get('num_mask_fcs', 3) + feedforward_channels = kwargs.get('feedforward_channels', 2048) + in_channels = kwargs.get('in_channels', 256) + out_channels = kwargs.get('out_channels', 256) + dropout = kwargs.get('dropout', 0.0) + mask_thr = kwargs.get('mask_thr', 0.5) + ffn_act_cfg = kwargs.get('ffn_act_cfg', None) + conv_kernel_size = kwargs.get('conv_kernel_size', 3) + feat_transform_cfg = kwargs.get('feat_transform_cfg', None) + hard_mask_thr = kwargs.get('hard_mask_thr', 0.5) + kernel_init = kwargs.get('kernel_init', False) + with_ffn = kwargs.get('with_ffn', True) + mask_out_stride = kwargs.get('mask_out_stride', 4) + mask_upsample_stride = kwargs.get('mask_upsample_stride', 1) + mask_assign_stride = kwargs.get('mask_assign_stride', 4) + kernel_updator_cfg = kwargs.get('kernel_updator_cfg', None) + loss_mask = kwargs.get('loss_mask', None) + + # init dict-like arguments + if isinstance(ffn_act_cfg, type(None)): + ffn_act_cfg = dict(type='ReLU', inplace=True) + if isinstance(kernel_updator_cfg, type(None)): + kernel_updator_cfg = dict() + if isinstance(loss_mask, type(None)): + loss_mask = dict(type='CrossEntropyLoss', use_mask=True, loss_weight=1.0) + if isinstance(loss_dice, type(None)): + loss_dice = dict(type='DiceLoss', loss_weight=3.0) + if isinstance(loss_cls, type(None)): + loss_cls = dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, + alpha=0.25, loss_weight=2.0) + + self.num_classes = num_classes + self.loss_cls = build_loss(loss_cls) + self.loss_mask = build_loss(loss_mask) + self.loss_dice = build_loss(loss_dice) + + self.in_channels = in_channels + self.out_channels = out_channels + self.mask_thr = mask_thr + self.fp16_enabled = False + self.dropout = dropout + + self.num_heads = num_heads + self.hard_mask_thr = hard_mask_thr + self.kernel_init = kernel_init + self.with_ffn = with_ffn + self.mask_out_stride = mask_out_stride + self.relative_coors = False + self.relative_coors_off = False + self.conv_kernel_size = conv_kernel_size + self.feat_gather_stride = 1 + self.mask_transform_stride = 1 + self.mask_upsample_stride = mask_upsample_stride + + self.num_thing_classes = 80 + self.num_stuff_classes = 53 + self.mask_assign_stride = mask_assign_stride + self.ignore_label = 255 + self.thing_label_in_seg = 0 + + self.attention = MultiheadAttention(in_channels * conv_kernel_size**2, + num_heads, dropout, num_proposals=num_proposals) + # self.attention_norm = build_norm_layer( + # dict(type='LN'), in_channels * conv_kernel_size**2)[1] + self.attention_norm = nn.LayerNorm([in_channels * conv_kernel_size ** 2]) + + self.kernel_update_conv = KernelUpdator(**kernel_updator_cfg) + + if feat_transform_cfg is not None: + kernel_size = feat_transform_cfg.pop('kernel_size', 1) + self.feat_transform = ConvModule( + in_channels, + in_channels, + kernel_size, + stride=1, + 
padding=int(1 // 2), + **feat_transform_cfg) + else: + self.feat_transform = None + + if self.with_ffn: + self.ffn = FFN( + in_channels, + feedforward_channels, + num_ffn_fcs, + act_cfg=ffn_act_cfg, + dropout_layer=dropout) + self.ffn_norm = nn.LayerNorm([in_channels]) + + self.cls_fcs = nn.CellList() + for _ in range(num_cls_fcs): + self.cls_fcs.append( + nn.Dense(in_channels, in_channels, has_bias=False)) + self.cls_fcs.append( + nn.LayerNorm([in_channels])) + self.cls_fcs.append(nn.ReLU()) + + if self.loss_cls.use_sigmoid: + self.fc_cls = nn.Dense(in_channels, self.num_classes) + else: + self.fc_cls = nn.Dense(in_channels, self.num_classes + 1) + + self.mask_fcs = nn.CellList() + for _ in range(num_mask_fcs): + self.mask_fcs.append( + nn.Dense(in_channels, in_channels, has_bias=False)) + self.mask_fcs.append(nn.LayerNorm([in_channels])) + self.mask_fcs.append(nn.ReLU()) + + self.fc_mask = nn.Dense(in_channels, out_channels) + self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) + self.interpolate = nn.ResizeBilinear() + + def init_weights(self): + self.init_parameters_data() + for _, m in self.cells_and_names(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.set_data(ms.Tensor(np.random.normal(0, np.sqrt(2. / n), + m.weight.data.shape).astype("float32"))) + if m.bias is not None: + m.bias.set_data( + ms.Tensor(np.zeros(m.bias.data.shape, dtype="float32"))) + elif isinstance(m, nn.BatchNorm2d): + m.gamma.set_data( + ms.Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) + m.beta.set_data( + ms.Tensor(np.zeros(m.beta.data.shape, dtype="float32"))) + elif isinstance(m, nn.Dense): + m.weight.set_data( + ms.Tensor(np.random.normal(0, 0.001, m.weight.data.shape).astype("float32"))) + if m.has_bias: + m.bias.set_data( + ms.Tensor(np.random.normal(0, 0.001, m.bias.data.shape).astype("float32"))) + if self.loss_cls.use_sigmoid: + self.fc_cls.bias.set_data(init.initializer(0.01, self.fc_cls.bias.shape)) + if self.kernel_init: + print('mask kernel in mask head is normal initialized by std 0.01') + self.fc_mask.weight.set_data(init.initializer( + init.Normal(0.01, 0), self.fc_mask.weight.shape)) + + def construct(self, *inputs, **kwargs): + raise NotImplementedError + + def loss(self, + object_feats, + cls_score, + mask_pred, + labels, + label_weights, + mask_targets, + mask_weights, + **kwargs): + losses = dict() + bg_class_ind = self.num_classes + # note in spare rcnn num_gt == num_pos + pos_inds = (labels >= 0).astype(ms.int32) & (labels < bg_class_ind).astype(ms.int32) + num_pos = pos_inds.sum().astype(ms.float32) + + num_preds = mask_pred.shape[0] * mask_pred.shape[1] + assert mask_pred.shape[0] == cls_score.shape[0] + assert mask_pred.shape[1] == cls_score.shape[1] + + if cls_score is not None: + get_size = ops.Size() + if get_size(cls_score) > 0: + avg_factor = labels.astype(ms.float32).asnumpy().sum() + losses['loss_cls'] = self.loss_cls( + cls_score.reshape(-1, 1), + labels.reshape(-1)).sum() / avg_factor + if mask_pred is not None: + bool_pos_inds = pos_inds.astype(ms.bool_) + # 0~self.num_classes-1 are FG, self.num_classes is BG + # do not perform bounding box regression for BG anymore. 
+ height, width = mask_pred.shape[-2:] + if bool_pos_inds.any(): + candi_index = ops.nonzero(bool_pos_inds).squeeze(-1) + pos_mask_pred = mask_pred.reshape(num_preds, height, + width)[candi_index] + pos_mask_targets = mask_targets[candi_index] + losses['loss_mask'] = self.loss_mask(pos_mask_pred, + pos_mask_targets) + losses['loss_dice'] = self.loss_dice(pos_mask_pred, + pos_mask_targets) + else: + losses['loss_mask'] = mask_pred.sum() * 0 + losses['loss_dice'] = mask_pred.sum() * 0 + + return losses + + def get_targets(self, + sampling_results, + gt_mask, + gt_labels, + rcnn_train_cfg, + concat=True, + gt_sem_seg=None, + gt_sem_cls=None): + + pos_inds_list = [res.pos_inds for res in sampling_results] + neg_inds_list = [res.neg_inds for res in sampling_results] + pos_mask_list = [res.pos_masks for res in sampling_results] + neg_mask_list = [res.neg_masks for res in sampling_results] + pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results] + pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results] + if gt_sem_seg is None: + # me: fix hard-code bug + num_imgs = len(sampling_results) + gt_sem_seg = [None] * num_imgs + gt_sem_cls = [None] * num_imgs + + labels, label_weights, mask_targets, mask_weights = multi_apply( + self._get_target_single, + pos_inds_list, + neg_inds_list, + pos_mask_list, + neg_mask_list, + pos_gt_mask_list, + pos_gt_labels_list, + gt_sem_seg, + gt_sem_cls, + cfg=rcnn_train_cfg) + if concat: + labels = ops.concat(labels, 0) + label_weights = ops.concat(label_weights, 0) + mask_targets = ops.concat(mask_targets, 0) + mask_weights = ops.concat(mask_weights, 0) + results = (labels, label_weights, mask_targets, mask_weights) + return results + + def rescale_masks(self, masks_per_img, img_meta): + h, w, _ = img_meta['img_shape'] + expand_dims = ops.ExpandDims() + masks_per_img = self.interpolate( + ms.ops.sigmoid(expand_dims(masks_per_img, 0)), + size=img_meta['batch_input_shape'], + align_corners=False) + + masks_per_img = masks_per_img[:, :, :h, :w] + ori_shape = img_meta['ori_shape'] + seg_masks = self.interpolate( + ms.Tensor(masks_per_img.asnumpy()), + size=tuple(ori_shape[:2].asnumpy().tolist()), + align_corners=False).squeeze(0) + return seg_masks + + def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img, + test_cfg, img_meta): + # resize mask predictions back + seg_masks = self.rescale_masks(masks_per_img, img_meta) + seg_masks = seg_masks > test_cfg['mask_thr'] + bbox_result, segm_result = self.segm2result(seg_masks, labels_per_img, + scores_per_img) + return bbox_result, segm_result + + def segm2result(self, mask_preds, det_labels, cls_scores): + num_classes = self.num_classes + bbox_result = None + segm_result = [[] for _ in range(num_classes)] + mask_preds = mask_preds.asnumpy() + det_labels = det_labels.asnumpy() + cls_scores = cls_scores.asnumpy() + + num_ins = mask_preds.shape[0] + # fake bboxes + bboxes = np.zeros((num_ins, 5), dtype=np.float32) + bboxes[:, -1] = cls_scores + bbox_result = [bboxes[det_labels == i, :] for i in range(num_classes)] + for idx in range(num_ins): + segm_result[det_labels[idx]].append(mask_preds[idx]) + return bbox_result, segm_result + + def get_seg_masks_onnx(self, masks_per_img, + test_cfg, img_meta): + seg_masks = masks_per_img > 0.5 + return seg_masks + + def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, + pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls, + cfg): + + num_pos = pos_mask.shape[0] + num_neg = neg_mask.shape[0] + num_samples = num_pos + num_neg + height, 
width = pos_mask.shape[-2:] + # original implementation uses new_zeros since BG are set to be 0 + # now use empty & fill because BG cat_id = num_classes, + # FG cat_id = [0, num_classes-1] + labels = ms.numpy.full((num_samples, ), + self.num_classes, + dtype=ms.int64) + new_zeros = ops.Zeros() + label_weights = new_zeros((num_samples, self.num_classes), pos_mask.dtype) + mask_targets = new_zeros((num_samples, height, width), pos_mask.dtype) + mask_weights = new_zeros((num_samples, height, width), pos_mask.dtype) + if num_pos > 0: + labels[pos_inds] = pos_gt_labels + pos_weight = 1.0 if cfg['pos_weight'] <= 0 else cfg['pos_weight'] + label_weights[pos_inds] = pos_weight + pos_mask_targets = pos_gt_mask + mask_targets[pos_inds] = pos_mask_targets + mask_weights[pos_inds] = 1 + + if num_neg > 0: + label_weights[neg_inds] = 1.0 + results = (labels, label_weights, mask_targets, mask_weights) + return results diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/roi/kernel_updator.py b/contrib/Overlap-Recovery/train/src/deoccluder/roi/kernel_updator.py new file mode 100644 index 0000000000000000000000000000000000000000..db8e82057cb09b556ba9bade94a08e2e091bcb0d --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/roi/kernel_updator.py @@ -0,0 +1,100 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
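As a side note on `KernelUpdateHead._get_target_single` above (the last method before this file): positives receive their ground-truth class and mask, negatives only get a classification weight, and every remaining slot keeps the background id `num_classes`. The toy numpy sketch below reproduces that layout for 2 positive and 2 negative samples with a single foreground class; it is illustrative only and not part of the code base.

```python
# Illustrative only: target layout produced by _get_target_single for a toy case.
import numpy as np

num_classes, h, w = 1, 4, 4
pos_inds, neg_inds = np.array([0, 1]), np.array([2, 3])
pos_gt_labels = np.array([0, 0])                      # both positives are class 0 (text)
pos_gt_mask = np.ones((2, h, w), dtype=np.float32)

num_samples = len(pos_inds) + len(neg_inds)
labels = np.full((num_samples,), num_classes, dtype=np.int64)          # background id = num_classes
label_weights = np.zeros((num_samples, num_classes), dtype=np.float32)
mask_targets = np.zeros((num_samples, h, w), dtype=np.float32)
mask_weights = np.zeros((num_samples, h, w), dtype=np.float32)

labels[pos_inds] = pos_gt_labels
label_weights[pos_inds] = 1.0      # pos_weight defaults to 1.0 when cfg['pos_weight'] <= 0
mask_targets[pos_inds] = pos_gt_mask
mask_weights[pos_inds] = 1
label_weights[neg_inds] = 1.0

print(labels)  # [0 0 1 1] -> positives keep their class, the rest stay background
```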
+ + +import mindspore as ms +from mindspore import nn, ops + + +class KernelUpdator(nn.Cell): + + def __init__(self, in_channels=256, feat_channels=64, out_channels=None, act_cfg=None): + super(KernelUpdator, self).__init__() + if isinstance(act_cfg, type(None)): + act_cfg = dict(type='ReLU', inplace=True) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.out_channels_raw = out_channels + self.gate_sigmoid = True + self.gate_norm_act = False + self.activate_out = False + input_feat_shape = 3 + if isinstance(input_feat_shape, int): + input_feat_shape = [input_feat_shape] * 2 + self.input_feat_shape = input_feat_shape + self.act_cfg = act_cfg + self.out_channels = out_channels if out_channels else in_channels + + self.num_params_in = self.feat_channels + self.num_params_out = self.feat_channels + self.dynamic_layer = nn.Dense( + self.in_channels, self.num_params_in + self.num_params_out) + self.input_layer = nn.Dense( + self.in_channels, self.num_params_in + self.num_params_out) + self.input_gate = nn.Dense(self.in_channels, self.feat_channels) + self.update_gate = nn.Dense(self.in_channels, self.feat_channels) + if self.gate_norm_act: + self.gate_norm = nn.LayerNorm([self.feat_channels]) + + self.norm_in = nn.LayerNorm([self.feat_channels]) + self.norm_out = nn.LayerNorm([self.feat_channels]) + self.input_norm_in = nn.LayerNorm([self.feat_channels]) + self.input_norm_out = nn.LayerNorm([self.feat_channels]) + + if act_cfg and act_cfg.get('type', 'None') == 'ReLU': + self.activation = nn.ReLU() + else: + self.activation = nn.Identity() + self.fc_layer = nn.Dense(self.feat_channels, self.out_channels) + self.fc_norm = nn.LayerNorm([self.out_channels]) + + def construct(self, update_feature, input_feature): + update_feature = update_feature.reshape(-1, self.in_channels) + num_proposals = update_feature.shape[0] + parameters = self.dynamic_layer(update_feature) + param_in = parameters[:, :self.num_params_in].view( + -1, self.feat_channels) + param_out = parameters[:, -self.num_params_out:].view( + -1, self.feat_channels) + input_feats = self.input_layer( + input_feature.reshape(num_proposals, -1, self.feat_channels)) + input_in = input_feats[..., :self.num_params_in] + input_out = input_feats[..., -self.num_params_out:] + + expand_dims = ops.ExpandDims() + gate_feats = input_in * expand_dims(param_in, -2) + if self.gate_norm_act: + gate_feats = self.activation(self.gate_norm(gate_feats)) + + input_gate = self.input_norm_in(self.input_gate(gate_feats)) + update_gate = self.norm_in(self.update_gate(gate_feats)) + if self.gate_sigmoid: + input_gate = ms.ops.sigmoid(input_gate) + update_gate = ms.ops.sigmoid(update_gate) + param_out = self.norm_out(param_out) + input_out = self.input_norm_out(input_out) + + if self.activate_out: + param_out = self.activation(param_out) + input_out = self.activation(input_out) + + # param_out has shape (batch_size, feat_channels, out_channels) + features = update_gate * expand_dims(param_out, -2) + input_gate * input_out + + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/rpn/__init__.py b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4f96c15807d6ba6a00c589cc2181e8677d1c44dd --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- 
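Before the RPN-side code starts, a minimal shape-check for the `KernelUpdator` cell defined above. This is only a sketch: the import path assumes the `train/src` layout from section 1.4 is on `PYTHONPATH`, and the channel sizes follow `kernel_updator_cfg` in `config_model.py` (in/feat/out channels all 256, so the Dense layers line up).

```python
# Illustrative only: feed random kernel features and mask-pooled features
# through KernelUpdator and check the output shape.
import numpy as np
import mindspore as ms

from src.deoccluder.roi.kernel_updator import KernelUpdator  # assumes train/src on PYTHONPATH

num_proposals, channels, h, w = 4, 256, 48, 48
updator = KernelUpdator(in_channels=channels, feat_channels=channels, out_channels=channels)

# one kernel feature vector per proposal, plus mask-pooled features per location
update_feature = ms.Tensor(np.random.randn(num_proposals, channels).astype(np.float32))
input_feature = ms.Tensor(np.random.randn(num_proposals, h * w, channels).astype(np.float32))

updated = updator(update_feature, input_feature)
print(updated.shape)  # expected: (4, 2304, 256) -> one updated feature per spatial location
```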
diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/rpn/kernel_head.py b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/kernel_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7ac7a9e3ac2123d6eada046cdc209e40d9cf915c --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/kernel_head.py @@ -0,0 +1,558 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np +import mindspore as ms +from mindspore import nn, ops +from mindspore import log as logger +from mindspore.common import initializer as init +from mindspore.communication.management import GlobalComm, get_group_size +from src.model_utils.configs import Config +from .semantic_fpn_wrapper import SemanticFPNWrapper +from ..custom_cells import (ConvModule, normal_init, build_loss, multi_apply, + build_sampler, build_assigner) + + +def bias_init_with_prob(prior_prob: float) -> float: + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +class ConvKernelHead(nn.Cell): + + def __init__(self, thing_label_in_seg=0, cat_stuff_mask=False, **kwargs): + super(ConvKernelHead, self).__init__() + norm_cfg = kwargs.get('norm_cfg', None) + loss_mask = kwargs.get('loss_mask', None) + loss_seg = kwargs.get('loss_seg', None) + loss_cls = kwargs.get('loss_cls', None) + loss_dice = kwargs.get('loss_dice', None) + loss_rank = kwargs.get('loss_rank', None) + if isinstance(norm_cfg, type(None)): + norm_cfg = dict(type='GN', num_groups=32) + self.num_proposals = kwargs.get('num_proposals', 100) + self.num_cls_fcs = kwargs.get('num_cls_fcs', 1) + self.train_cfg = Config(kwargs.get('train_cfg', None)) + self.in_channels = kwargs.get('in_channels', 256) + self.out_channels = kwargs.get('out_channels', 256) + self.num_classes = kwargs.get('num_classes', 80) + self.proposal_feats_with_obj = kwargs.get('proposal_feats_with_obj', False) + self.sampling = False + self.localization_fpn = SemanticFPNWrapper(**kwargs.get('localization_fpn', dict())) + self.semantic_fpn = kwargs.get('semantic_fpn', True) + self.norm_cfg = norm_cfg + self.num_heads = kwargs.get('num_heads', 8) + self.att_dropout = kwargs.get('att_dropout', False) + self.mask_out_stride = kwargs.get('mask_out_stride', 4) + self.hard_target = kwargs.get('hard_target', False) + self.conv_kernel_size = kwargs.get('conv_kernel_size', 1) + self.xavier_init_kernel = kwargs.get('xavier_init_kernel', False) + self.kernel_init_std = kwargs.get('kernel_init_std', 0.01) + self.feat_downsample_stride = kwargs.get('feat_downsample_stride', 1) + self.feat_refine_stride = kwargs.get('feat_refine_stride', 1) + self.conv_normal_init = kwargs.get('conv_normal_init', False) + self.feat_refine = kwargs.get('feat_refine', True) + self.with_embed = kwargs.get('with_embed', False) + self.feat_embed_only = kwargs.get('feat_embed_only', False) + self.num_loc_convs = kwargs.get('num_loc_convs', 1) + 
self.num_seg_convs = kwargs.get('num_seg_convs', 1) + self.use_binary = kwargs.get('use_binary', False) + self.num_thing_classes = kwargs.get('num_thing_classes', 80) + self.num_stuff_classes = kwargs.get('num_stuff_classes', 53) + self.mask_assign_stride = kwargs.get('mask_assign_stride', 4) + self.ignore_label = kwargs.get('ignore_label', 255) + self.thing_label_in_seg = thing_label_in_seg + self.cat_stuff_mask = cat_stuff_mask + + self.loss_mask = ops.BinaryCrossEntropy() + if loss_mask is not None: + self.loss_mask = build_loss(loss_mask) + else: + self.loss_mask = loss_mask + + if loss_dice is not None: + self.loss_dice = build_loss(loss_dice) + else: + self.loss_dice = loss_dice + + if loss_seg is not None: + self.loss_seg = build_loss(loss_seg) + else: + self.loss_seg = loss_seg + if loss_cls is not None: + self.loss_cls = build_loss(loss_cls) + else: + self.loss_cls = loss_cls + + if loss_rank is not None: + self.loss_rank = build_loss(loss_rank) + else: + self.loss_rank = loss_rank + + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # use PseudoSampler when sampling is False + if self.sampling and hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='MaskPseudoSampler') + self.sampler = build_sampler(sampler_cfg) + self._init_layers() + self.allreduce = ops.AllReduce(ops.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) + self.init_weights() + self.sigmoid = ops.Sigmoid() + + def init_weights(self): + self.localization_fpn.init_weights() + + if self.feat_downsample_stride > 1 and self.conv_normal_init: + logger.info('Initialize convs in KPN head by normal std 0.01') + for conv in [self.loc_convs, self.seg_convs]: + for m in conv.cells_and_names(): + if isinstance(m, nn.Conv2d): + normal_init(m, init_gain=0.01) + + if self.semantic_fpn: + bias_seg = bias_init_with_prob(0.01) + if self.loss_seg.use_sigmoid: + normal_init(self.conv_seg, init_gain=0.01, bias=bias_seg) + else: + normal_init(self.conv_seg, mean=0, init_gain=0.01) + if self.xavier_init_kernel: + logger.info('Initialize kernels by xavier uniform') + self.init_kernels.weight.set_data( + init.initializer(init.XavierUniform(), self.init_kernels.weight.shape)) + else: + logger.info( + f'Initialize kernels by normal std: {self.kernel_init_std}') + normal_init(self.init_kernels, mean=0, init_gain=self.kernel_init_std) + + def forward_train(self, + img, + gt_masks, + gt_labels, + **kwargs,): + """Forward function in training stage.""" + img_metas = kwargs.get('img_metas', None) + gt_sem_seg = kwargs.get('gt_sem_seg', None) + gt_sem_cls = kwargs.get('gt_sem_cls', None) + num_imgs = len(img_metas) + results = self._decode_init_proposals(img, img_metas) + (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = results + if self.feat_downsample_stride > 1: + interpolate = nn.ResizeBilinear() + scaled_mask_preds = interpolate( + mask_preds, + scale_factor=self.feat_downsample_stride, + align_corners=False) + if seg_preds is not None: + scaled_seg_preds = interpolate( + seg_preds, + scale_factor=self.feat_downsample_stride, + align_corners=False) + else: + scaled_seg_preds = None + else: + scaled_mask_preds = mask_preds + scaled_seg_preds = seg_preds + + if self.hard_target: + gt_masks = [x.bool().astype(ms.float32) for x in gt_masks] + else: + gt_masks = gt_masks + + sampling_results = [] + if cls_scores is None: + detached_cls_scores = [None] * num_imgs + else: + detached_cls_scores = ops.stop_gradient(cls_scores) + for i in range(num_imgs): + 
assign_result = self.assigner.assign(ops.stop_gradient(scaled_mask_preds[i]), + detached_cls_scores[i], + gt_masks[i], gt_labels[i], + img_metas[i]) + sampling_result = self.sampler.sample(assign_result, + scaled_mask_preds[i], + gt_masks[i]) + sampling_results.append(sampling_result) + + mask_targets = self.get_targets( + sampling_results, + gt_masks, + self.train_cfg, + True, + gt_sem_seg=gt_sem_seg, + gt_sem_cls=gt_sem_cls) + + losses = self.loss(scaled_mask_preds, cls_scores, scaled_seg_preds, + proposal_feats, *mask_targets) + + if self.cat_stuff_mask and self.training: + mask_preds = ops.concat( + [mask_preds, seg_preds[:, self.num_thing_classes:]], axis=1) + stuff_kernels = self.conv_seg.weight[self. + num_thing_classes:].clone() + stuff_kernels = stuff_kernels[None].broadcast_to((num_imgs, ) + stuff_kernels.shape) + proposal_feats = ops.concat([proposal_feats, stuff_kernels], axis=1) + results = (losses, proposal_feats, x_feats, mask_preds, cls_scores) + return results + + def loss(self, + mask_pred, + cls_scores, + seg_preds, + proposal_feats, + labels, + label_weights, + mask_targets, + mask_weights, + seg_targets, + **kwargs): + losses = dict() + bg_class_ind = self.num_classes + # note in spare rcnn num_gt == num_pos + pos_inds = (labels >= 0).astype(ms.int32) & (labels < bg_class_ind).astype(ms.int32) + num_preds = mask_pred.shape[0] * mask_pred.shape[1] + if cls_scores is not None: + raise NotImplementedError + + bool_pos_inds = pos_inds.astype(ms.bool_) + # 0~self.num_classes-1 are FG, self.num_classes is BG + # do not perform bounding box regression for BG anymore. + height, width = mask_pred.shape[-2:] + if bool_pos_inds.sum(): + candi_index = ops.nonzero(bool_pos_inds).squeeze(-1) + pos_mask_pred = mask_pred.reshape(num_preds, height, width)[candi_index] + pos_mask_targets = mask_targets[candi_index] + losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred, + pos_mask_targets) + losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred, + pos_mask_targets) + + if self.loss_rank is not None: + raise NotImplementedError + + else: + losses['loss_rpn_mask'] = mask_pred.sum() * 0 + losses['loss_rpn_dice'] = mask_pred.sum() * 0 + if self.loss_rank is not None: + losses['loss_rank'] = mask_pred.sum() * 0 + + if seg_preds is not None: + if self.loss_seg.use_sigmoid: + losses['loss_rpn_seg'] = self.loss_seg(seg_preds.squeeze(1), seg_targets.astype(ms.float32)) + else: + raise NotImplementedError + + return losses + + def get_targets(self, + sampling_results, + gt_mask, + rpn_train_cfg, + concat=True, + gt_sem_seg=None, + gt_sem_cls=None): + pos_inds_list = [res.pos_inds for res in sampling_results] + neg_inds_list = [res.neg_inds for res in sampling_results] + pos_mask_list = [res.pos_masks for res in sampling_results] + neg_mask_list = [res.neg_masks for res in sampling_results] + pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results] + pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results] + if gt_sem_seg is None: + # me: fix hard-code bug. 
+ num_imgs = len(sampling_results) + gt_sem_seg = [None] * num_imgs + gt_sem_cls = [None] * num_imgs + results = multi_apply( + self._get_target_single, + pos_inds_list, + neg_inds_list, + pos_mask_list, + neg_mask_list, + pos_gt_mask_list, + pos_gt_labels_list, + gt_sem_seg, + gt_sem_cls, + cfg=rpn_train_cfg) + (labels, label_weights, mask_targets, mask_weights, + seg_targets) = results + if concat: + labels = ops.concat(labels, 0) + label_weights = ops.concat(label_weights, 0) + mask_targets = ops.concat(mask_targets, 0) + mask_weights = ops.concat(mask_weights, 0) + seg_targets = ops.stack(seg_targets, 0) + results = (labels, label_weights, mask_targets, mask_weights, seg_targets) + return results + + def simple_test_rpn(self, img, img_metas): + """Forward function in testing stage.""" + return self._decode_init_proposals(img, img_metas) + + def forward_dummy(self, img, img_metas): + """Dummy forward function. + + Used in flops calculation. + """ + return self._decode_init_proposals(img, img_metas) + + def onnx_export(self, x): + """Test without augmentation. + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + Returns: + Tensor: dets of shape [N, num_det, 5]. + """ + rpn_results = self._decode_init_proposals_export(x) + + (proposal_feats, x_feats, mask_preds, cls_scores, + seg_preds) = rpn_results + return rpn_results + + def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, + pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls, + cfg): + num_pos = pos_mask.shape[0] + num_neg = neg_mask.shape[0] + num_samples = num_pos + num_neg + height, width = pos_mask.shape[-2:] + # original implementation uses new_zeros since BG are set to be 0 + # now use empty & fill because BG cat_id = num_classes, + # FG cat_id = [0, num_classes-1] + labels = ms.numpy.full((num_samples, ), + self.num_classes, + dtype=ms.int64) + new_zeros = ops.Zeros() + type_ = pos_mask.dtype + label_weights = new_zeros((num_samples, ), type_) + mask_targets = new_zeros((num_samples, height, width), type_) + mask_weights = new_zeros((num_samples, height, width), type_) + seg_targets = ms.numpy.full((height, width), + self.num_classes, + dtype=ms.int64) + + if gt_sem_cls is not None and gt_sem_seg is not None: + gt_sem_seg = gt_sem_seg.bool() + for sem_mask, sem_cls in zip(gt_sem_seg, gt_sem_cls): + seg_targets[sem_mask] = sem_cls.astype(ms.int64) + if num_pos > 0: + labels[pos_inds] = pos_gt_labels + pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight + label_weights[pos_inds] = pos_weight + mask_targets[pos_inds] = pos_gt_mask + mask_weights[pos_inds] = 1 + for i in range(num_pos): + seg_targets[pos_gt_mask[i].astype(ms.bool_)] = pos_gt_labels[i] + + if num_neg > 0: + label_weights[neg_inds] = 1.0 + results = (labels, label_weights, mask_targets, mask_weights, seg_targets) + return results + + def _decode_init_proposals_export(self, img): + num_imgs = 1 + localization_feats = self.localization_fpn.model_export(img) + + if isinstance(localization_feats, list): + loc_feats = localization_feats[0] + else: + loc_feats = localization_feats + for conv in self.loc_convs: + loc_feats = conv(loc_feats) + if self.feat_downsample_stride > 1 and self.feat_refine: + loc_feats = self.ins_downsample(loc_feats) + mask_preds = self.init_kernels(loc_feats) + + if self.semantic_fpn: + if isinstance(localization_feats, list): + semantic_feats = localization_feats[1] + else: + semantic_feats = localization_feats + for conv in 
self.seg_convs: + semantic_feats = conv(semantic_feats) + if self.feat_downsample_stride > 1 and self.feat_refine: + semantic_feats = self.seg_downsample(semantic_feats) + else: + semantic_feats = None + + if semantic_feats is not None: + seg_preds = self.conv_seg(semantic_feats) + else: + seg_preds = None + + tmp_feat = np.array(self.init_kernels.weight).astype(np.float32) + tmp_feat = np.broadcast_to(tmp_feat[None], (num_imgs, ) + tmp_feat.shape) + proposal_feats = ms.Tensor(np.copy(tmp_feat), dtype=self.init_kernels.weight.dtype) + + if semantic_feats is not None: + x_feats = semantic_feats + loc_feats + else: + x_feats = loc_feats + + if self.proposal_feats_with_obj: + sigmoid_masks = self.sigmoid(mask_preds) + nonzero_inds = sigmoid_masks > 0.5 + if self.use_binary: + sigmoid_masks = nonzero_inds.astype(ms.float32) + else: + sigmoid_masks = nonzero_inds.astype(ms.float32) * sigmoid_masks + b, n, h, w = sigmoid_masks.shape + _, c, _, _ = x_feats.shape + tmp_sigmoid_masks = ms.ops.reshape(sigmoid_masks, (b, n, h*w)) + tmp_x_feats = ms.ops.reshape(x_feats, (b, c, h*w)) + tmp_x_feats = ms.ops.transpose(tmp_x_feats, (0, 2, 1)) + obj_feats = ms.ops.bmm(tmp_sigmoid_masks, tmp_x_feats) + else: + obj_feats = None + + cls_scores = None + + if self.proposal_feats_with_obj: + proposal_feats = proposal_feats + obj_feats.view( + num_imgs, self.num_proposals, self.out_channels, 1, 1) + + if self.cat_stuff_mask and not self.training: + mask_preds = ops.concat( + [mask_preds, seg_preds[:, self.num_thing_classes:]], axis=1) + stuff_kernels = self.conv_seg.weight[self. + num_thing_classes:].clone() + stuff_kernels = ms.ops.broadcast_to(stuff_kernels[None], (num_imgs, ) + stuff_kernels.shape) + proposal_feats = ops.concat([proposal_feats, stuff_kernels], axis=1) + results = (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) + return results + + def _decode_init_proposals(self, img, img_metas): + num_imgs = len(img_metas) + localization_feats = self.localization_fpn(img) + if isinstance(localization_feats, list): + loc_feats = localization_feats[0] + else: + loc_feats = localization_feats + for conv in self.loc_convs: + loc_feats = conv(loc_feats) + if self.feat_downsample_stride > 1 and self.feat_refine: + loc_feats = self.ins_downsample(loc_feats) + mask_preds = self.init_kernels(loc_feats) + + if self.semantic_fpn: + if isinstance(localization_feats, list): + semantic_feats = localization_feats[1] + else: + semantic_feats = localization_feats + for conv in self.seg_convs: + semantic_feats = conv(semantic_feats) + if self.feat_downsample_stride > 1 and self.feat_refine: + semantic_feats = self.seg_downsample(semantic_feats) + else: + semantic_feats = None + + if semantic_feats is not None: + seg_preds = self.conv_seg(semantic_feats) + else: + seg_preds = None + + + + proposal_feats = self.init_kernels.weight.clone() + proposal_feats = proposal_feats[None].broadcast_to((num_imgs, ) + proposal_feats.shape) + + if semantic_feats is not None: + x_feats = semantic_feats + loc_feats + else: + x_feats = loc_feats + + if self.proposal_feats_with_obj: + sigmoid_masks = self.sigmoid(mask_preds) + nonzero_inds = sigmoid_masks > 0.5 + if self.use_binary: + sigmoid_masks = nonzero_inds.astype(ms.float32) + else: + sigmoid_masks = nonzero_inds.astype(ms.float32) * sigmoid_masks + einsum = ops.Einsum('bnhw,bchw->bnc') + obj_feats = einsum((sigmoid_masks, x_feats)) + else: + obj_feats = None + + cls_scores = None + + if self.proposal_feats_with_obj: + proposal_feats = proposal_feats + obj_feats.view( + 
num_imgs, self.num_proposals, self.out_channels, 1, 1) + + if self.cat_stuff_mask and not self.training: + mask_preds = ops.concat( + [mask_preds, seg_preds[:, self.num_thing_classes:]], axis=1) + stuff_kernels = self.conv_seg.weight[self. + num_thing_classes:].clone() + stuff_kernels = stuff_kernels[None].broadcast_to((num_imgs, ) + stuff_kernels.shape) + proposal_feats = ops.concat([proposal_feats, stuff_kernels], axis=1) + results = (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) + return results + + def _init_layers(self): + """Initialize a sparse set of proposal boxes and proposal features.""" + self.init_kernels = nn.Conv2d( + self.out_channels, + self.num_proposals, + self.conv_kernel_size, + padding=int(self.conv_kernel_size // 2), + has_bias=False) + + if self.semantic_fpn: + if self.loss_seg.use_sigmoid: + self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes, + 1) + else: + self.conv_seg = nn.Conv2d(self.out_channels, + self.num_classes + 1, 1) + + if self.feat_downsample_stride > 1 and self.feat_refine: + self.ins_downsample = ConvModule( + self.in_channels, + self.out_channels, + 3, + stride=self.feat_refine_stride, + padding=1, + norm_cfg=self.norm_cfg) + self.seg_downsample = ConvModule( + self.in_channels, + self.out_channels, + 3, + stride=self.feat_refine_stride, + padding=1, + norm_cfg=self.norm_cfg) + + self.loc_convs = nn.CellList() + for i in range(self.num_loc_convs): + self.loc_convs.append( + ConvModule( + self.in_channels, + self.out_channels, + 1, + norm_cfg=self.norm_cfg)) + + self.seg_convs = nn.CellList() + for i in range(self.num_seg_convs): + self.seg_convs.append( + ConvModule( + self.in_channels, + self.out_channels, + 1, + norm_cfg=self.norm_cfg)) diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/rpn/positional_encoding.py b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/positional_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3bfd66aad5fd918bf985983b36d8a27e4f11be --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/positional_encoding.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import numpy as np +import mindspore as ms +from mindspore import nn +from mindspore import ops + + +class SinePositionalEncoding(nn.Cell): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. 
The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, num_feats, normalize=False, scale=2 * math.pi, **kwargs): + super(SinePositionalEncoding, self).__init__() + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = kwargs.get('temperature', 10000) + self.normalize = normalize + self.scale = scale + self.eps = kwargs.get('eps', 1e-6) + self.offset = kwargs.get('offset', 0.) + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + def construct(self, mask): + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. + mask = mask.astype(ms.int32) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=ms.float32) + x_embed = not_mask.cumsum(2, dtype=ms.float32) + + if self.normalize: + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = ms.Tensor(np.arange(self.num_feats), dtype=ms.float32) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + batch_size, height, width = mask.shape + sin = ops.Sin() + cos = ops.Cos() + pos_x = ops.stack( + (sin(pos_x[:, :, :, 0::2]), cos(pos_x[:, :, :, 1::2])), + axis=4).view(batch_size, height, width, -1) + pos_y = ops.stack( + (sin(pos_y[:, :, :, 0::2]), cos(pos_y[:, :, :, 1::2])), + axis=4).view(batch_size, height, width, -1) + pos = ops.concat((pos_y, pos_x), axis=3).transpose((0, 3, 1, 2)) + return pos + + def model_export(self, mask): + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. 
+ mask = mask.astype(ms.int32) + not_mask = 1 - mask # logical_not + + tmp_not_mask = np.array(not_mask, dtype=np.int32) + y_embed = np.cumsum(tmp_not_mask, axis=1, dtype=np.float32) + x_embed = np.cumsum(tmp_not_mask, axis=2, dtype=np.float32) + + if self.normalize: + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = np.arange(self.num_feats).astype(np.float32) + dim_t = self.temperature**(2 * (dim_t / 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + batch_size, height, width = mask.shape + + tmp_pos_x = pos_x + tmp_pos_y = pos_y + tmp_pos_x = np.stack( + (np.sin(tmp_pos_x[:, :, :, 0::2]), np.cos(tmp_pos_x[:, :, :, 1::2])), axis=4 + ).reshape(batch_size, height, width, -1) + tmp_pos_y = np.stack( + (np.sin(tmp_pos_y[:, :, :, 0::2]), np.cos(tmp_pos_y[:, :, :, 1::2])), axis=4 + ).reshape(batch_size, height, width, -1) + tmp_pos = np.concatenate((tmp_pos_y, tmp_pos_x), axis=3).transpose((0, 3, 1, 2)) + pos = ms.Tensor(tmp_pos, dtype=ms.float32) + + return pos diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/rpn/semantic_fpn_wrapper.py b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/semantic_fpn_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..7f0e7744fe9487c50dc4131bd683a14954bdc9e6 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/rpn/semantic_fpn_wrapper.py @@ -0,0 +1,292 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np +import mindspore as ms +from mindspore import nn, ops +from mindspore import log as logger +from ..custom_cells import CustomResizeBilinear, ConvModule, normal_init +from .positional_encoding import SinePositionalEncoding + + +class SemanticFPNWrapper(nn.Cell): + """Implementation of Semantic FPN used in Panoptic FPN. + + Args: + in_channels ([type]): [description] + feat_channels ([type]): [description] + out_channels ([type]): [description] + start_level ([type]): [description] + end_level ([type]): [description] + cat_coors (bool, optional): [description]. Defaults to False. + fuse_by_cat (bool, optional): [description]. Defaults to False. + conv_cfg ([type], optional): [description]. Defaults to None. + norm_cfg ([type], optional): [description]. Defaults to None. 
+ """ + + def __init__(self, in_channels, feat_channels, out_channels, **kwargs): + super(SemanticFPNWrapper, self).__init__() + start_level = kwargs.get('start_level', -1) + end_level = kwargs.get('end_level', -1) + positional_encoding = kwargs.get('positional_encoding', None) + cat_coors_level = kwargs.get('cat_coors_level', 3) + fuse_by_cat = kwargs.get('fuse_by_cat', False) + upsample_times = kwargs.get('upsample_times', 3) + num_aux_convs = kwargs.get('num_aux_convs', 0) + act_cfg = kwargs.get('act_cfg', None) + out_act_cfg = kwargs.get('out_act_cfg', None) + # init dict-like arguments + if isinstance(act_cfg, type(None)): + act_cfg = dict(type='ReLU', inplace=True) + if isinstance(out_act_cfg, type(None)): + out_act_cfg = dict(type='ReLU') + + self.in_channels = in_channels + self.feat_channels = feat_channels + self.start_level = start_level + self.end_level = end_level + assert start_level >= 0 and end_level >= start_level + self.out_channels = out_channels + self.conv_cfg = kwargs.get('conv_cfg', None) + self.norm_cfg = kwargs.get('norm_cfg', None) + self.act_cfg = act_cfg + self.cat_coors = kwargs.get('cat_coors', False) + self.cat_coors_level = cat_coors_level + self.fuse_by_cat = fuse_by_cat + self.return_list = kwargs.get('return_list', False) + self.upsample_times = upsample_times + self.with_pred = kwargs.get('with_pred', True) + if positional_encoding is not None: + self.positional_encoding = SinePositionalEncoding(**positional_encoding) + else: + self.positional_encoding = None + self.convs_all_levels = nn.CellList() + for i in range(self.start_level, self.end_level + 1): + convs_per_level = nn.SequentialCell() + if i == 0: + if i == self.cat_coors_level and self.cat_coors: + chn = self.in_channels + 2 + else: + chn = self.in_channels + if upsample_times == self.end_level - i: + one_conv = ConvModule( + chn, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + convs_per_level.append(one_conv) + else: + for ii in range(self.end_level - upsample_times): + one_conv = ConvModule( + chn, + self.feat_channels, + 3, + padding=1, + stride=2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + convs_per_level.append(one_conv) + self.convs_all_levels.append(convs_per_level) + continue + + for j in range(i): + if j == 0: + if i == self.cat_coors_level and self.cat_coors: + chn = self.in_channels + 2 + else: + chn = self.in_channels + one_conv = ConvModule( + chn, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + convs_per_level.append(one_conv) + if j < upsample_times - (self.end_level - i): + one_upsample = CustomResizeBilinear( + scale_factor=2, align_corners=False) + convs_per_level.append(one_upsample) + continue + + one_conv = ConvModule( + self.feat_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + convs_per_level.append(one_conv) + if j < upsample_times - (self.end_level - i): + one_upsample = CustomResizeBilinear( + scale_factor=2, align_corners=False) + convs_per_level.append(one_upsample) + + self.convs_all_levels.append(convs_per_level) + + if fuse_by_cat: + in_channels = self.feat_channels * len(self.convs_all_levels) + else: + in_channels = self.feat_channels + + if self.with_pred: + self.conv_pred = ConvModule( + in_channels, + self.out_channels, + 1, + padding=0, + conv_cfg=self.conv_cfg, + act_cfg=out_act_cfg, + norm_cfg=self.norm_cfg) 
+ + self.num_aux_convs = num_aux_convs + self.aux_convs = nn.CellList() + for i in range(num_aux_convs): + self.aux_convs.append( + ConvModule( + in_channels, + self.out_channels, + 1, + padding=0, + conv_cfg=self.conv_cfg, + act_cfg=out_act_cfg, + norm_cfg=self.norm_cfg)) + + def init_weights(self): + logger.info('Use normal intialization for semantic FPN') + for m in self.cells_and_names(): + if isinstance(m, nn.Conv2d): + normal_init(m, init_gain=0.01) + + def generate_coord(self, input_feat): + x_range = ops.linspace(ms.Tensor(-1, dtype=ms.float32), + ms.Tensor(1, dtype=ms.float32), + input_feat.shape[-1]) + y_range = ops.linspace(ms.Tensor(-1, dtype=ms.float32), + ms.Tensor(1, dtype=ms.float32), + input_feat.shape[-2]) + y, x = ops.meshgrid((y_range, x_range)) + y = y.broadcast_to((input_feat.shape[0], 1, -1, -1)) + x = x.broadcast_to((input_feat.shape[0], 1, -1, -1)) + coord_feat = ops.concat([x, y], 1) + return coord_feat + + def construct(self, inputs): + mlvl_feats = [] + for i in range(self.start_level, self.end_level + 1): + input_p = inputs[i] + if i == self.cat_coors_level: + if self.positional_encoding is not None: + new_zeros = ops.Zeros() + ignore_mask = new_zeros( + (input_p.shape[0], input_p.shape[-2], + input_p.shape[-1]), ms.bool_) + + positional_encoding = self.positional_encoding(ignore_mask) + input_p = input_p + positional_encoding + if self.cat_coors: + coord_feat = self.generate_coord(input_p) + input_p = ops.concat([input_p, coord_feat], 1) + + mlvl_feats.append(self.convs_all_levels[i](input_p)) + + if self.fuse_by_cat: + feature_add_all_level = ops.concat(mlvl_feats, axis=1) + else: + feature_add_all_level = sum(mlvl_feats) + + if self.with_pred: + out = self.conv_pred(feature_add_all_level) + else: + out = feature_add_all_level + + if self.num_aux_convs > 0: + outs = [out] + for conv in self.aux_convs: + outs.append(conv(feature_add_all_level)) + return outs + + if self.return_list: + return [out] + else: + return out + + + def model_export(self, inputs): + mlvl_feats = [] + for i in range(self.start_level, self.end_level + 1): + input_p = inputs[i] + if i == self.cat_coors_level: + if self.positional_encoding is not None: + new_zeros = ops.Zeros() + ignore_mask = new_zeros( + (input_p.shape[0], input_p.shape[-2], + input_p.shape[-1]), ms.bool_) + + positional_encoding = self.positional_encoding.model_export(ignore_mask) + input_p = input_p + positional_encoding + if self.cat_coors: + coord_feat = self.generate_coord_export(input_p) + input_p = ops.concat([input_p, coord_feat], 1) + + mlvl_feats.append(self.convs_all_levels[i](input_p)) + + if self.fuse_by_cat: + feature_add_all_level = ops.concat(mlvl_feats, axis=1) + else: + feature_add_all_level = sum(mlvl_feats) + + if self.with_pred: + out = self.conv_pred(feature_add_all_level) + else: + out = feature_add_all_level + + if self.num_aux_convs > 0: + outs = [out] + for conv in self.aux_convs: + outs.append(conv(feature_add_all_level)) + return outs + + if self.return_list: + return [out] + else: + return out + + def generate_coord_export(self, input_feat): + x_range = ops.linspace(ms.Tensor(-1, dtype=ms.float32), + ms.Tensor(1, dtype=ms.float32), + input_feat.shape[-1]) + y_range = ops.linspace(ms.Tensor(-1, dtype=ms.float32), + ms.Tensor(1, dtype=ms.float32), + input_feat.shape[-2]) + y, x = ops.meshgrid((y_range, x_range)) + + tmp_y = np.array(y.numpy()) + tmp_y = np.broadcast_to(tmp_y, (input_feat.shape[0], 1, -1, -1)) + y = ms.Tensor(tmp_y, dtype=y.dtype) + + tmp_x = np.array(x.numpy()) + tmp_x = 
np.broadcast_to(tmp_x, (input_feat.shape[0], 1, -1, -1)) + x = ms.Tensor(tmp_x, dtype=y.dtype) + + coord_feat = ops.concat([x, y], 1) + return coord_feat \ No newline at end of file diff --git a/contrib/Overlap-Recovery/train/src/deoccluder/utils.py b/contrib/Overlap-Recovery/train/src/deoccluder/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..288ccecee38462c7db68da198cb88d22ba3a49b6 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/deoccluder/utils.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import mindspore as ms +from mindspore import nn, ops + + +def sem2ins_masks(gt_sem_seg, num_thing_classes=80): + """Convert semantic segmentation mask to binary masks + + Args: + gt_sem_seg (torch.Tensor): Semantic masks to be converted. + [0, num_thing_classes-1] is the classes of things, + [num_thing_classes:] is the classes of stuff. + num_thing_classes (int, optional): Number of thing classes. + Defaults to 80. + + Returns: + tuple[torch.Tensor]: (mask_labels, bin_masks). + Mask labels and binary masks of stuff classes. + """ + unique = ops.Unique() + classes = unique(gt_sem_seg) + masks = [] + labels = [] + + for i in classes: + # skip ignore class 255 and "thing classes" in semantic seg + if i == 255 or i < num_thing_classes: + continue + labels.append(i) + masks.append(gt_sem_seg == i) + + if len(labels) > 0: + stack = ops.Stack() + labels = stack(labels) + masks = ops.concat(masks) + else: + labels = gt_sem_seg.new_zeros(size=[0]) + masks = gt_sem_seg.new_zeros( + size=[0, gt_sem_seg.shape[-2], gt_sem_seg.shape[-1]]) + return labels.astype(ms.int64), masks.astype(ms.float32) diff --git a/contrib/Overlap-Recovery/train/src/model_utils/__init__.py b/contrib/Overlap-Recovery/train/src/model_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/contrib/Overlap-Recovery/train/src/model_utils/configs/__init__.py b/contrib/Overlap-Recovery/train/src/model_utils/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6a2dd17971a799a872c6d4acfa5e69b3367050be --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/model_utils/configs/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from .config_base import Config diff --git a/contrib/Overlap-Recovery/train/src/model_utils/configs/config_base.py b/contrib/Overlap-Recovery/train/src/model_utils/configs/config_base.py new file mode 100644 index 0000000000000000000000000000000000000000..bdd8a6e457079c6bae5b17ae505f6f224f2f2fd3 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/model_utils/configs/config_base.py @@ -0,0 +1,110 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from pprint import pprint, pformat +from .config_model import model + + +class Config: + """ + Configuration namespace. Convert dictionary to members. + """ + def __init__(self, cfg_dict): + for k, v in cfg_dict.items(): + setattr(self, k, v) + + def __str__(self): + return pformat(self.__dict__) + + def __repr__(self): + return self.__str__() + + def get(self, attr_name, default_value=None): + return getattr(self, attr_name, default_value) + + +SYNTH_DATA_ROOT = "root-directory-to-train-data" +REAL_DATA_ROOT = "root-directory-to-test-data" +IMG_SCALE = (768, 768) +IMG_NORM_CFG = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +TRAIN_PIPELINE = [ + dict(type='LoadImageFromFile'), + dict(type='CustomLoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=IMG_SCALE, keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **IMG_NORM_CFG), + dict(type='Pad', size_divisor=768), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'], + meta_keys=('ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction'), + ), +] +TEST_PIPELINE = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', img_scale=IMG_SCALE, keep_ratio=True), + dict(type='Normalize', **IMG_NORM_CFG), + dict(type='Pad', size_divisor=768, eval_model=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img'], + meta_keys=('ori_shape', 'img_shape', 'pad_shape', 'scale_factor'), + eval_mode=True), +] +CONFIG_DICT = dict( + model=model, + pre_trained="", + data=dict( + samples_per_gpu=8, + workers_per_gpu=8, + train=dict( + type='SynthOverlapDataset', + ann_file=SYNTH_DATA_ROOT + 'train_gt.jsonl', + img_prefix=SYNTH_DATA_ROOT, + pipeline=TRAIN_PIPELINE), + val=dict( + type='RealOverlapDataset', + ann_file=REAL_DATA_ROOT + 'annotation.json', + img_prefix=REAL_DATA_ROOT, + pipeline=TEST_PIPELINE, + test_mode=True), + test=dict( + type='RealOverlapDataset', + ann_file=REAL_DATA_ROOT + 'annotation.json', + img_prefix=REAL_DATA_ROOT, + pipeline=TEST_PIPELINE, + test_mode=True) + ), + train_cfg=dict( + # usually we only need to train 1 epoch to reach the desired performance + total_epoch=60, + optimizer='Adam', + lr=0.00005, + lr_power=4e-10, + wd=0.05, + save_iterval=1, + ckpt_max=10000, + ), + device_target='GPU', + mindrecord_dir='path-for-saving-logs-and-files', + pretrained_r50='path-to-pretrained-model', + do_eval=False, + run_distribute=False, + enable_modelarts=False, + checkpoint_path='path-to-checkpoint-model' +) + +config = Config(CONFIG_DICT) diff --git a/contrib/Overlap-Recovery/train/src/model_utils/configs/config_model.py b/contrib/Overlap-Recovery/train/src/model_utils/configs/config_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4e97db44a640dc2e960310378daaa85c0cee30 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/model_utils/configs/config_model.py @@ -0,0 +1,149 @@ + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +NUM_STAGES = 3 +NUM_PROPOSALS = 4 +CONV_KERNEL_SIZE = 1 +NUM_CLASSES = 1 +kernel_occlusion_cfg = dict( + num_proposals=NUM_PROPOSALS, + pair_manner='sum', + u_mask_loss=dict( + type='BinaryCrossEntropy', loss_weight=1.0), + i_mask_loss=dict( + type='BinaryCrossEntropy', loss_weight=1.0), + u_dice_loss=dict(type='DiceLoss', loss_weight=4.0), + i_dice_loss=dict(type='DiceLoss', loss_weight=4.0), +) +model = dict( + mask_assign_stride=4, + # origin size: 768 * 768 + feature_shapes=[[192, 192], [96, 96], [48, 48], [24, 24], [12, 12]], + backbone=dict( + layer_nums=[3, 4, 6, 3], + in_channels=[64, 256, 512, 1024], + out_channels=[256, 512, 1024, 2048]), + neck=dict( + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=4), + rpn_head=dict( + conv_kernel_size=CONV_KERNEL_SIZE, + feat_downsample_stride=2, + feat_refine_stride=1, + feat_refine=False, + use_binary=True, + num_loc_convs=1, + num_seg_convs=1, + conv_normal_init=True, + localization_fpn=dict( + in_channels=256, + feat_channels=256, + out_channels=256, + start_level=0, + end_level=3, + upsample_times=2, + positional_encoding=dict(num_feats=128, normalize=True), + cat_coors=False, + cat_coors_level=3, + fuse_by_cat=False, + return_list=False, + num_aux_convs=1, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)), + num_proposals=NUM_PROPOSALS, + proposal_feats_with_obj=True, + xavier_init_kernel=False, + kernel_init_std=1, + num_cls_fcs=1, + in_channels=256, + num_classes=NUM_CLASSES, + feat_transform_cfg=None, + loss_seg=dict( + type='BinaryCrossEntropy', + loss_weight=1.0 + ), + loss_mask=dict( + type='BinaryCrossEntropy', loss_weight=1.0), + loss_dice=dict(type='DiceLoss', loss_weight=4.0), + train_cfg=dict( + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), + mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)), + sampler=dict(type='MaskPseudoSampler'), + pos_weight=1 + ), + test_cfg=None, + ), + roi_head=dict( + type='CustomKernelIterHead', + num_stages=NUM_STAGES, + stage_loss_weights=[1] * NUM_STAGES, + proposal_feature_channel=256, + mask_head=[ + dict( + kernel_occlusion_cfg=kernel_occlusion_cfg, + apply_kernel_occlusion=True, + num_classes=NUM_CLASSES, + num_ffn_fcs=2, + num_heads=8, + num_cls_fcs=1, + num_mask_fcs=1, + feedforward_channels=2048, + in_channels=256, + out_channels=256, + dropout=0.0, + mask_thr=0.5, + conv_kernel_size=CONV_KERNEL_SIZE, + mask_upsample_stride=2, + ffn_act_cfg=dict(type='ReLU', inplace=True), + with_ffn=True, + feat_transform_cfg=dict( + conv_cfg=dict(type='Conv2d'), act_cfg=None), + kernel_updator_cfg=dict( + in_channels=256, + feat_channels=256, + out_channels=256, + act_cfg=dict(type='ReLU', inplace=True)), + loss_mask=dict( + type='BinaryCrossEntropy', loss_weight=1.0), + loss_dice=dict( + type='DiceLoss', loss_weight=4.0), + loss_cls=dict( + type='SigmoidFocalClassificationLoss', + loss_weight=2.0), + num_proposals=NUM_PROPOSALS + ) for _ in range(NUM_STAGES) 
+ ], + train_cfg=[ + dict( + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True), + mask_cost=dict(type='MaskCost', weight=1.0, + pred_act=True)), + sampler=dict(type='MaskPseudoSampler'), + pos_weight=1) for _ in range(NUM_STAGES) + ], + test_cfg=dict( + max_per_img=NUM_PROPOSALS, + mask_thr=0.5, + merge_stuff_thing=dict( + iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3)) + ), + ) diff --git a/contrib/Overlap-Recovery/train/src/model_utils/device_adapter.py b/contrib/Overlap-Recovery/train/src/model_utils/device_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..e589ad560f2065b83a4bffecd4b9a45d34d313a0 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/model_utils/device_adapter.py @@ -0,0 +1,27 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Device adapter for ModelArts""" + +from .configs.config_base import config + +if config.enable_modelarts: + from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id +else: + from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id + +__all__ = [ + "get_device_id", "get_device_num", "get_rank_id", "get_job_id" +] diff --git a/contrib/Overlap-Recovery/train/src/model_utils/local_adapter.py b/contrib/Overlap-Recovery/train/src/model_utils/local_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..6b8285ca81a6010e5d69cb052059071c1c7d120d --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/model_utils/local_adapter.py @@ -0,0 +1,37 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
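A quick usage note tying the configuration pieces above together (illustrative only; import paths assume the `train/src` layout from section 1.4): `config_base.py` wraps `CONFIG_DICT` in the attribute-style `Config` namespace, and `device_adapter.py` re-exports either the local or the ModelArts helpers depending on `config.enable_modelarts`.

```python
# Illustrative only: reading the shipped configuration and device info.
from src.model_utils.configs.config_base import config
from src.model_utils.device_adapter import get_device_id, get_device_num

print(config.get('device_target'))      # 'GPU' in the shipped CONFIG_DICT
print(config.train_cfg['total_epoch'])  # 60
print(config.train_cfg['lr'])           # 5e-05

# enable_modelarts is False by default, so these come from local_adapter.py
# (shown next) and simply read the DEVICE_ID / RANK_SIZE environment variables.
print(get_device_id(), get_device_num())
```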
+# ============================================================================
+
+"""Local adapter"""
+
+import os
+
+
+def get_device_id():
+    device_id = os.getenv('DEVICE_ID', '0')
+    return int(device_id)
+
+
+def get_device_num():
+    device_num = os.getenv('RANK_SIZE', '1')
+    return int(device_num)
+
+
+def get_rank_id():
+    global_rank_id = os.getenv('RANK_ID', '0')
+    return int(global_rank_id)
+
+
+def get_job_id():
+    return "Local Job"
diff --git a/contrib/Overlap-Recovery/train/src/model_utils/moxing_adapter.py b/contrib/Overlap-Recovery/train/src/model_utils/moxing_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7dff5a29ed70bc2b61cd3cb9833603a1c1643f8
--- /dev/null
+++ b/contrib/Overlap-Recovery/train/src/model_utils/moxing_adapter.py
@@ -0,0 +1,124 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Moxing adapter for ModelArts"""
+
+import os
+import functools
+from mindspore import context
+from mindspore.profiler import Profiler
+from .configs.config_base import config
+
+_GLOBAL_SYNC_COUNT = 0
+
+
+def get_device_id():
+    device_id = os.getenv('DEVICE_ID', '0')
+    return int(device_id)
+
+
+def get_device_num():
+    device_num = os.getenv('RANK_SIZE', '1')
+    return int(device_num)
+
+
+def get_rank_id():
+    global_rank_id = os.getenv('RANK_ID', '0')
+    return int(global_rank_id)
+
+
+def get_job_id():
+    # fall back to "default" when JOB_ID is unset or empty
+    job_id = os.getenv('JOB_ID', "default")
+    job_id = job_id if job_id != "" else "default"
+    return job_id
+
+
+def sync_data(from_path, to_path):
+    """
+    Download data from remote OBS to a local directory when the source is a remote URL
+    and the destination is a local path; otherwise upload data from the local directory to remote OBS.
+    """
+    import moxing as mox
+    import time
+    global _GLOBAL_SYNC_COUNT
+    sync_lock = "/tmp/copy_sync.lock" + str(_GLOBAL_SYNC_COUNT)
+    _GLOBAL_SYNC_COUNT += 1
+
+    # Each server contains at most 8 devices.
+    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+        print("from path: ", from_path)
+        print("to path: ", to_path)
+        mox.file.copy_parallel(from_path, to_path)
+        print("===finish data synchronization===")
+        try:
+            os.mknod(sync_lock)
+        except IOError:
+            pass
+        print("===save flag===")
+
+    while True:
+        if os.path.exists(sync_lock):
+            break
+        time.sleep(1)
+
+    print("Finish sync data from {} to {}.".format(from_path, to_path))
+
+
+def moxing_wrapper(pre_process=None, post_process=None):
+    """
+    Moxing wrapper to download dataset and upload outputs.
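+
+    When config.enable_modelarts is set, the wrapped function first syncs
+    config.data_url, config.checkpoint_url and config.train_url to their local
+    counterparts (config.data_path, config.load_path, config.output_path),
+    optionally runs under the MindSpore Profiler (config.enable_profiling),
+    and finally uploads config.output_path back to config.train_url.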
+ """ + def wrapper(run_func): + @functools.wraps(run_func) + def wrapped_func(*args, **kwargs): + # Download data from data_url + if config.enable_modelarts: + if config.data_url: + sync_data(config.data_url, config.data_path) + print("Dataset downloaded: ", os.listdir(config.data_path)) + if config.checkpoint_url: + sync_data(config.checkpoint_url, config.load_path) + print("Preload downloaded: ", os.listdir(config.load_path)) + if config.train_url: + sync_data(config.train_url, config.output_path) + print("Workspace downloaded: ", os.listdir(config.output_path)) + + context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + config.device_num = get_device_num() + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + + if pre_process: + pre_process() + + if config.enable_profiling: + profiler = Profiler() + + run_func(*args, **kwargs) + + if config.enable_profiling: + profiler.analyse() + + # Upload data to train_url + if config.enable_modelarts: + if post_process: + post_process() + + if config.train_url: + print("Start to copy output directory") + sync_data(config.output_path, config.train_url) + return wrapped_func + return wrapper diff --git a/contrib/Overlap-Recovery/train/src/utils/pth2ckpt.py b/contrib/Overlap-Recovery/train/src/utils/pth2ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..cb4a9ab591e84f9eeea19bed6cc7710619550e65 --- /dev/null +++ b/contrib/Overlap-Recovery/train/src/utils/pth2ckpt.py @@ -0,0 +1,48 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import argparse +import json +import torch +from mindspore import Tensor +from mindspore.train.serialization import save_checkpoint + + + +parser = argparse.ArgumentParser(description="trans pth to ckpt") +parser.add_argument('--pth-path', type=str, default='resnet50-19c8e357.pth', help="The path of pth file") +parser.add_argument('--ckpt-path', type=str, default='pretrained_resnet50.ckpt', help='The path to save ckpt file') +parser.add_argument('--dict-file', type=str, required=True, help='dict file') + +args = parser.parse_args() + +pth_dict = torch.load(args.pth_path) + + +with open(args.dict_file, 'r') as f: + name_dict = json.load(f) + +new_param_list = [] + +for pth_name, ckpt_name in name_dict.items(): + param_dict = {} + data = pth_dict[pth_name] + param_dict['name'] = ckpt_name + param_dict['data'] = Tensor(data.detach().numpy()) + new_param_list.append(param_dict) + + +save_checkpoint(new_param_list, args.ckpt_path) +print(f'The ckpt file is saved in {args.ckpt_path}') diff --git a/contrib/Overlap-Recovery/train/train.py b/contrib/Overlap-Recovery/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..fcd7ea8de20bc5e19487341f8b1b535d868e3716 --- /dev/null +++ b/contrib/Overlap-Recovery/train/train.py @@ -0,0 +1,135 @@ +"""train model.""" + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import time +import os +import numpy as np + +from src.model_utils.configs.config_base import config +from src.model_utils.device_adapter import get_device_id, get_device_num +from src.dataset import build_dataset +from src.deoccluder import CustomKNet, TrainModelWrapper +from loguru import logger + +import mindspore.common.dtype as mstype +from mindspore import context, Tensor, Parameter +from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor, LossMonitor +from mindspore.train import Model +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.common import set_seed +from mindspore import dataset as de + + +# set fixed seed +set_seed(1) + + +def load_pretrained_ckpt(net, load_path, device_target): + param_dict = load_checkpoint(load_path) + + if config.pretrain_epoch_size == 0: + key_mapping = {'down_sample_layer.1.beta': 'bn_down_sample.beta', + 'down_sample_layer.1.gamma': 'bn_down_sample.gamma', + 'down_sample_layer.0.weight': 'conv_down_sample.weight', + 'down_sample_layer.1.moving_mean': 'bn_down_sample.moving_mean', + 'down_sample_layer.1.moving_variance': 'bn_down_sample.moving_variance', + } + for oldkey in list(param_dict.keys()): + if not oldkey.startswith(('backbone', 'end_point', 'global_step', + 'learning_rate', 'moments', 'momentum')): + data = param_dict.pop(oldkey) + newkey = 'backbone.' 
+ oldkey + param_dict[newkey] = data + oldkey = newkey + for k, v in key_mapping.items(): + if k in oldkey: + newkey = oldkey.replace(k, v) + param_dict[newkey] = param_dict.pop(oldkey) + break + + for item in list(param_dict.keys()): + if not (item.startswith('backbone') or item.startswith('rcnn_mask')): + param_dict.pop(item) + + if device_target == 'GPU': + for key, value in param_dict.items(): + tensor = Tensor(value, mstype.float32) + param_dict[key] = Parameter(tensor, key) + + load_param_into_net(net, param_dict) + return net + + +def train_model(): + device_target = config.device_target + context.set_context(mode=context.PYNATIVE_MODE, device_target=device_target, device_id=get_device_id()) + + logger.info("Start train!") + rank = 0 + + logger.info("Start create dataset!") + + # It will generate mindrecord file in config.mindrecord_dir + if rank == 0 and not os.path.exists(config.mindrecord_dir): + os.makedirs(config.mindrecord_dir) + if rank == 0: + logger.add(os.path.join(config.mindrecord_dir, time.asctime(time.localtime()).replace(' ', '_')+".log")) + + # prepare dataset + train_set = build_dataset(config.data['train']) + collect_pipe = config.data['train']['pipeline'][-1] + column_names = list(collect_pipe['keys']) + list(collect_pipe['meta_keys']) + train_set = de.GeneratorDataset(train_set, + column_names=column_names, + num_parallel_workers=config.data['workers_per_gpu'], + shuffle=False) + train_set = train_set.batch(config.data['samples_per_gpu'], drop_remainder=True) + + # Prepare model + config.train = True + net = CustomKNet(config.model) + net = net.set_train() + net.load_r50(config.pretrained_r50) + net = TrainModelWrapper(net) + # load checkpoint or pretrained model + load_path = config.pre_trained + if load_path != "": + logger.info(f"Loading pretrained checkpoint from {load_path}") + net = load_pretrained_ckpt(net=net, load_path=load_path, device_target=device_target) + + # Learning rate adjustment. + steps_per_epoch = train_set.get_dataset_size() + + # Create model + model = Model(net) + + # Callbacks + time_cb = TimeMonitor(data_size=10) + loss_cb = LossMonitor(per_print_times=10) + + # Save-checkpoint callback + ckpt_config = CheckpointConfig(save_checkpoint_steps=min(500, steps_per_epoch * config.train_cfg['save_iterval']), + keep_checkpoint_max=config.train_cfg['ckpt_max']) + ckpt_cb = ModelCheckpoint(prefix='{}'.format("KNet_Deoccluder_SGD"), + directory=config.mindrecord_dir + "/card" + str(rank), + config=ckpt_config) + cb = [time_cb, loss_cb, ckpt_cb] + model.train(config.train_cfg['total_epoch'], train_set, callbacks=cb, dataset_sink_mode=False) + + +if __name__ == '__main__': + train_model()