diff --git a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/README.md b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/README.md index 44706889ced5202644bf26f0d1ac32f5269ce812..5f28c3728fbe46c9d897a951e6260b35b6bb21ee 100644 --- a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/README.md +++ b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/README.md @@ -41,7 +41,7 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 | 输入数据 | 大小 | 数据类型 | 数据排布格式 | |----------|------------------| ------------------------- | ------------ | | image | ${bs} x 3 x 224 x 224 | FLOAT32 | NCHW | - + | txt | ${bs} x 512 | INT32 | ND | - 输出数据 @@ -57,8 +57,8 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 | 配套 | 版本 | 环境准备指导 | | ------------------------------------------------------------ |---------| ------------------------------------------------------------ | -| 固件与驱动 | 24.0.RC2 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | -| CANN(+ MindIE) | 8.0.RC2 | - | +| 固件与驱动 | 24.1.RC2 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | +| CANN(+ MindIE) | 8.0.RC3 | - | | Python | 3.8.17 | - | | PyTorch | 1.11.0 | - | | 说明:Atlas 300I Duo 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | @@ -87,7 +87,7 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 pip3 install -r requirements.txt # 修改第三方源码推理适配部分 - patch -p0 < ../cn_clip.patch + patch -p2 < ../cn_clip.patch pip3 install -e . cd .. @@ -122,6 +122,13 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 下载[权重文件](https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_vit-b-16.pt) clip_cn_vit-b-16.pt 置于 models 目录下 + 2. 获取数据集。 + 下载[cifar-100数据集](http://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/cifar-100.zip) + ``` + mkdir -p ./Chinese-CLIP/data/datasets + ``` + 在dataset文件夹中解压缩 + 2. 导出onnx文件。 1. 使用 Chinese-CLIP/cn_clip/deploy/pytorch_to_onnx.py 导出onnx文件。 @@ -143,7 +150,7 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 - --convert-vision: 指定是否转文本侧模型 - --convert-vision: 指定是否转图像侧模型 - 运行成功后,使用models目录下生成的 vit-b-16.txt.fp16.onnx 和 vit-b-16.img.fp16.onnx 文件进行后续操作。 + 运行成功后,使用models目录下生成的 vit-b-16.txt.fp32.onnx 和 vit-b-16.img.fp32.onnx 文件进行后续操作。 3. 使用 onnx-simplifier 简化 onnx 模型。 @@ -152,15 +159,15 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 1. 文本模型 ```shell - # export bs=20 - onnxsim models/vit-b-16.txt.fp16.onnx models/vit-b-16.txt.fp16.bs${bs}.sim.onnx --overwrite-input-shape "text:${bs},512" + # export bs=24 + onnxsim models/vit-b-16.txt.fp32.onnx models/vit-b-16.txt.fp32.bs${bs}.sim.onnx --overwrite-input-shape "text:${bs},512" ``` 2. 图像模型 ```shell - # export bs=20 - onnxsim models/vit-b-16.img.fp16.onnx models/vit-b-16.img.fp16.bs${bs}.sim.onnx --overwrite-input-shape "image:${bs},3,224,224" + # export bs=24 + onnxsim models/vit-b-16.img.fp32.onnx models/vit-b-16.img.fp32.bs${bs}.sim.onnx --overwrite-input-shape "image:${bs},3,224,224" ``` 4. 使用 opt_onnx.py 优化 onnx 模型。 @@ -169,17 +176,17 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 1. 文本模型 ```shell - # export bs=20 - python3 opt_txt_onnx.py models/vit-b-16.txt.fp16.bs${bs}.sim.onnx models/vit-b-16.txt.fp16.bs${bs}.opt.onnx + # export bs=24 + python3 opt_txt_onnx.py models/vit-b-16.txt.fp32.bs${bs}.sim.onnx models/vit-b-16.txt.fp32.bs${bs}.opt.onnx ``` 2. 
图像模型 - + ```shell - # export bs=20 + # export bs=24 python3 opt_img_onnx.py \ - --input_file models/vit-b-16.img.fp16.bs${bs}.sim.onnx \ - --output_file models/vit-b-16.img.fp16.bs${bs}.opt.onnx \ + --input_file models/vit-b-16.img.fp32.bs${bs}.sim.onnx \ + --output_file models/vit-b-16.img.fp32.bs${bs}.opt.onnx \ --model_config vit_base_patch16_224 \ --use_flashattention ``` @@ -219,10 +226,10 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 3. 执行ATC命令。 ```shell - # 例如 export bs=20 && export chip_name=310P3 + # 例如 export bs=24 && export chip_name=310P3 atc --framework=5 \ - --model=models/vit-b-16.txt.fp16.bs${bs}.opt.onnx \ - --output=models/vit-b-16.txt.fp16.bs${bs} \ + --model=models/vit-b-16.txt.fp32.bs${bs}.opt.onnx \ + --output=models/vit-b-16.txt.bs${bs} \ --input_format=ND \ --input_shape="text:${bs},512" \ --soc_version=Ascend${chip_name} \ @@ -230,17 +237,16 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 --optypelist_for_implmode="Gelu" \ --op_select_implmode=high_performance - # 例如 export bs=20 && export chip_name=310P3 + # 例如 export bs=24 && export chip_name=310P3 atc --framework=5 \ - --model=models/vit-b-16.img.fp16.bs${bs}.opt.onnx \ - --output=models/vit-b-16.img.fp16.bs${bs} \ + --model=models/vit-b-16.img.fp32.bs${bs}.opt.onnx \ + --output=models/vit-b-16.img.bs${bs} \ --input_format=NCHW \ --input_shape="image:${bs},3,224,224" \ --soc_version=Ascend${chip_name} \ --log=error \ --optypelist_for_implmode="Sigmoid" \ --op_select_implmode=high_performance \ - --insert_op_conf aipp.config \ --enable_small_channel 1 ``` - 参数说明: @@ -253,10 +259,9 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 - --soc_version:处理器型号。 - --optypelist_for_implmode:指定算子。 - --op_select_implmode:选择高性能/高精度模式,与 --optypelist_for_implmode 配合使用。 - - --insert_op_conf:AIPP配置文件。 - --enable_small_channel:与 --insert_op_conf 配合使用。 - 运行成功后,在 models 目录下生成 vit-b-16.img.fp16.${bs}.om 离线模型文件。 + 运行成功后,在 models 目录下生成 vit-b-16.img.bs${bs}.om 和 vit-b-16.txt.bs${bs}.om 离线模型文件。 6. 开始推理验证。 @@ -264,29 +269,29 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 请访问[ais_bench推理工具](https://gitee.com/ascend/tools/tree/master/ais-bench_workload/tool/ais_bench)代码仓,根据readme文档进行工具安装。 - 2. 余弦相似度精度验证。 + 2. 数据集精度验证 ``` - python3 compare.py --img-model-path models/vit-b-16.img.fp16.bs${bs}.om --batch-size ${bs} + cd Chinese-CLIP + export vision_om=../models/vit-b-16.img.bs24.om + export text_om=../models/vit-b-16.txt.bs24.om + bash run_scripts/zeroshot_eval.sh 0 data cifar-100 ViT-B-16 RoBERTa-wwm-ext-base-chinese ../models/clip_cn_vit-b-16.pt ${text_om} ${vision_om} + cd .. ``` + 得到数据集精度 top1: 64.04% - - 参数说明: - - - --img-model-path:图像侧模型所在路径 - - --batch-size:批次大小 - - 运行成功后打印om推理结果与cpu推理结果的余弦相似度,确保精度正常。 + 4. 
性能验证。 纯推理性能测试命令如下: ```shell - # export bs=20 - python3 -m ais_bench --model models/vit-b-16.txt.fp16.bs${bs}.om --loop 50 + # export bs=24 + python3 -m ais_bench --model models/vit-b-16.txt.bs${bs}.om --loop 50 - # export bs=20 - python3 -m ais_bench --model models/vit-b-16.img.fp16.bs${bs}.om --loop 50 + # export bs=24 + python3 -m ais_bench --model models/vit-b-16.img.bs${bs}.om --loop 50 ``` @@ -298,10 +303,10 @@ Chinese_CLIP为CLIP模型的中文版本,使用大规模中文数据进行训 | 芯片型号 | Input Shape | 单次推理耗时 | | -------- | ----------- | ------------ | -| 300I Pro | 20 x 512 | 91.6ms | +| 300I Pro | 24 x 512 | 103ms | - 图像侧模型: | 芯片型号 | Batch Size | 单次推理耗时 | |----------|------------|----------| -| 300I Pro | 20 | 35ms | +| 300I Pro | 24 | 39ms | diff --git a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/aipp.config b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/aipp.config deleted file mode 100644 index e11e609a500bb1fc69860279c54e6e20d20239f3..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/aipp.config +++ /dev/null @@ -1,16 +0,0 @@ -aipp_op{ - aipp_mode:static - input_format : RGB888_U8 - - src_image_size_w : 224 - src_image_size_h : 224 - - crop: false - - min_chn_0 : 123.25239296 - min_chn_1 : 117.20384 - min_chn_2 : 104.50194688 - var_reci_chn_0: 0.0145414015152615 - var_reci_chn_1: 0.0149491443980385 - var_reci_chn_2: 0.014164518585317 -} \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/cn_clip.patch b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/cn_clip.patch index 9541637028965c2c5448e48c4b866e0f82b8b89a..cd5f9278797672e559e49d7fa38b8ef0c1d75668 100644 --- a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/cn_clip.patch +++ b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/cn_clip.patch @@ -1,6 +1,6 @@ -diff -urN cn_clip/deploy/pytorch_to_onnx.py cn_clip_fix/deploy/pytorch_to_onnx.py ---- cn_clip/deploy/pytorch_to_onnx.py 2024-02-20 14:49:36.955252200 +0800 -+++ cn_clip_fix/deploy/pytorch_to_onnx.py 2024-03-17 14:48:30.339636500 +0800 +diff -Nur b/Chinese-CLIP/cn_clip/deploy/pytorch_to_onnx.py a/Chinese-CLIP/cn_clip/deploy/pytorch_to_onnx.py +--- b/Chinese-CLIP/cn_clip/deploy/pytorch_to_onnx.py 2024-08-07 10:13:21.636000000 +0000 ++++ a/Chinese-CLIP/cn_clip/deploy/pytorch_to_onnx.py 2024-08-07 01:17:34.820000000 +0000 @@ -117,8 +117,9 @@ text_fp32_onnx_path, input_names=['text'], @@ -24,3 +24,250 @@ diff -urN cn_clip/deploy/pytorch_to_onnx.py cn_clip_fix/deploy/pytorch_to_onnx.p verbose=True) # for ViT-H-14 FP32 model, make another conversion to deal with the generated small files if args.model_arch == "ViT-H-14": +diff -Nur b/Chinese-CLIP/cn_clip/eval/zeroshot_evaluation.py a/Chinese-CLIP/cn_clip/eval/zeroshot_evaluation.py +--- b/Chinese-CLIP/cn_clip/eval/zeroshot_evaluation.py 2024-08-07 10:13:21.636000000 +0000 ++++ a/Chinese-CLIP/cn_clip/eval/zeroshot_evaluation.py 2024-08-07 10:07:28.980000000 +0000 +@@ -10,7 +10,7 @@ + from tqdm import tqdm + + import torch +- ++from ais_bench.infer.interface import InferSession + from cn_clip.clip.model import convert_weights, CLIP + from cn_clip.clip import tokenize + from cn_clip.training.main import convert_models_to_fp32 +@@ -29,6 +29,16 @@ + def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( ++ "--text-om", ++ default="vit-b-text.om", ++ help="Name of the text om model to use.", ++ ) ++ parser.add_argument( ++ "--vision-om", ++ default="vit-b-image.om", ++ help="Name of the vision om model to use.", ++ ) ++ 
parser.add_argument( + "--vision-model", + choices=["ViT-B-32", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"], + default="ViT-B-16", +@@ -82,12 +92,12 @@ + # help="Path to imagenet val set for conducting zero shot evaluation.", + # ) + parser.add_argument( +- "--img-batch-size", type=int, default=64, help="Image batch size." ++ "--img-batch-size", type=int, default=24, help="Image batch size." + ) + parser.add_argument( + "--context-length", + type=int, +- default=52, ++ default=512, + help="The maximum length of input text (include [CLS] & [SEP] tokens)." + ) + parser.add_argument( +@@ -106,16 +116,24 @@ + + def zero_shot_classifier(model, classnames, templates, args): + with torch.no_grad(): ++ session_code = InferSession(0, model) + zeroshot_weights = [] + for classname in tqdm(classnames): + texts = [_preprocess_text(template(classname)) for template in templates] # format with class +- texts = tokenize(texts, context_length=args.context_length).to(args.gpu) # tokenize +- class_embeddings = model(None, texts) ++ texts = tokenize(texts, context_length=args.context_length) ++ x = torch.zeros(9,512) ++ texts = torch.concat((texts,x),dim=0) ++ res_class = [] ++ for i in range(8): ++ feed = [texts[i*24:(i+1)*24,].to(torch.int64)] ++ class_embeddings = session_code.infer(feed) ++ res_class.append(torch.from_numpy(class_embeddings[0])) ++ class_embeddings = torch.concat(res_class, dim=0)[:183,] + class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True) + class_embedding = class_embeddings.mean(dim=0) + class_embedding /= class_embedding.norm() + zeroshot_weights.append(class_embedding) +- zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.gpu) ++ zeroshot_weights = torch.stack(zeroshot_weights, dim=1) + return zeroshot_weights + + +@@ -128,27 +146,31 @@ + def run(model, classifier, dataloader, args): + total_logits = [] + total_targets = [] +- with torch.no_grad(): +- top1, top5, n = 0.0, 0.0, 0.0 +- for images, target in tqdm(dataloader): +- images = images.to(args.gpu) +- target = target.to(args.gpu) +- total_targets.append(target) +- +- # predict +- image_features = model(images, None) +- image_features /= image_features.norm(dim=-1, keepdim=True) +- logits = (100.0 * image_features @ classifier).softmax(dim=-1) +- total_logits.append(logits) +- +- # measure accuracy +- acc1, acc5 = accuracy(logits, target, topk=(1, 1)) +- top1 += acc1 +- n += images.size(0) ++ session_code = InferSession(0, model) ++ # with torch.no_grad(): ++ top1, top5, n = 0.0, 0.0, 0.0 ++ for images, target in tqdm(dataloader): ++ images = images ++ target = target ++ total_targets.append(target) ++ batch = images.size(0) ++ if batch != 24: ++ pad = torch.zeros_like(images[:(24-batch)]) ++ images = torch.concat((images,pad), dim=0) ++ feed = [images.to(torch.float32)] ++ image_features = session_code.infer(feed) ++ image_features = torch.from_numpy(image_features[0][:batch,]) ++ image_features /= image_features.norm(dim=-1, keepdim=True) ++ logits = (100.0 * image_features @ classifier).softmax(dim=-1) ++ total_logits.append(logits) ++ ++ # measure accuracy ++ acc1, acc5 = accuracy(logits, target, topk=(1, 1)) ++ top1 += acc1 ++ n += batch + + outputs = torch.cat(total_logits, dim=0) + targets = torch.cat(total_targets, dim=0) +- + if getattr(args, "index", ""): + print("Use index to rearrange the logits...") + with open(args.index, "r", encoding="utf-8") as f: +@@ -166,16 +188,14 @@ + if __name__ == "__main__": + args = parse_args() + ++ text_model = args.text_om ++ vision_model = 
args.vision_om + # Log params. + print("Params:") + for name in sorted(vars(args)): + val = getattr(args, name) + print(f" {name}: {val}") + +- args.gpu = 0 +- torch.cuda.set_device(args.gpu) +- +- # Initialize the model. + vision_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{args.vision_model.replace('/', '-')}.json" + print('Loading vision model config from', vision_model_config_file) + assert os.path.exists(vision_model_config_file) +@@ -191,17 +211,6 @@ + for k, v in json.load(ft).items(): + model_info[k] = v + +- model = CLIP(**model_info) +- convert_weights(model) +- +- # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372 +- if args.precision == "amp" or args.precision == "fp32": +- convert_models_to_fp32(model) +- model.cuda(args.gpu) +- if args.precision == "fp16": +- convert_weights(model) +- +- # Get eval data. + print("Preparing zeroshot dataset.") + data = {} + print(f"{model_info['image_resolution']}") +@@ -209,26 +218,8 @@ + args, image_transform(model_info["image_resolution"]) + ) + +- # Resume from a checkpoint. +- print("Begin to load model checkpoint from {}.".format(args.resume)) +- assert os.path.exists(args.resume), "The checkpoint file {} not exists!".format(args.resume) +- # Map model to be loaded to specified single gpu. +- loc = "cuda:{}".format(args.gpu) +- checkpoint = torch.load(args.resume, map_location='cpu') +- start_epoch = checkpoint["epoch"] +- sd = checkpoint["state_dict"] +- if next(iter(sd.items()))[0].startswith('module'): +- sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k} +- model.load_state_dict(sd) +- print( +- f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']} @ {checkpoint['step']} steps)" +- ) +- +- # Compute ensembled class embeddings + print('Building zero-shot classifier') + +- model.eval() +- + f = open(args.label_file, "r", encoding="utf8") + classnames = [line.strip() for line in f.readlines()] + +@@ -246,32 +237,10 @@ + else: + templates = template_dict['openai'] + +- # Make inference and evaluation + print('Using classifier') +- classifier = zero_shot_classifier(model, classnames, templates, args) ++ classifier = zero_shot_classifier(text_model, classnames, templates, args) + results = {} +- top1, logits = run(model, classifier, data[args.dataset].dataloader, args) +- +- def json_prec_dump(data, prec=6): +- return json.dumps( +- json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec)) +- ) +- +- print(logits.size()) +- output_dict = { +- "model_name": "CN-CLIP-" + args.vision_model, +- "dataset_name": args.dataset, +- "num_trainable_params": 0, +- "num_params": sum(x.numel() for x in model.parameters()), +- "num_visual_params": sum(x.numel() for x in model.visual.parameters()), +- "num_backbone_params": sum(x.numel() for x in model.parameters()), +- "n_shot": 0, +- "rnd_seeds": [123], +- "predictions": [logits.cpu().data.numpy().tolist()], +- } +- json_string = json_prec_dump(output_dict) +- with open(os.path.join(args.save_dir, f"{args.dataset}.json"), "w", encoding="utf-8") as w: +- w.write(json_string) ++ top1, logits = run(vision_model, classifier, data[args.dataset].dataloader, args) + + results["zeroshot-top1"] = top1 + +diff -Nur b/Chinese-CLIP/run_scripts/zeroshot_eval.sh a/Chinese-CLIP/run_scripts/zeroshot_eval.sh +--- b/Chinese-CLIP/run_scripts/zeroshot_eval.sh 2024-08-07 10:13:21.656000000 +0000 ++++ a/Chinese-CLIP/run_scripts/zeroshot_eval.sh 2024-08-07 10:07:22.108000000 +0000 +@@ -18,7 +18,9 @@ + 
text_model=${5} + resume=${6} + label_file=${path}/datasets/${dataset}/label_cn.txt +-index=${7:-} ++text_om=${7} ++vision_om=${8} ++index=${9:-} + + mkdir -p ${savedir} + +@@ -28,7 +30,9 @@ + --save-dir=${savedir} \ + --dataset=${dataset} \ + --index=${index} \ +- --img-batch-size=64 \ ++ --img-batch-size=24 \ + --resume=${resume} \ + --vision-model=${vision_model} \ +- --text-model=${text_model} ++ --text-model=${text_model} \ ++ --text-om=${text_om} \ ++ --vision-om=${vision_om} diff --git a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/compare.py b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/compare.py deleted file mode 100644 index 4df6ecfa8983e6ee693001777c44978b442d96e9..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/compare.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -import json -from PIL import Image -from pathlib import Path - -import torch -import numpy as np -from torchvision.transforms import Compose, Resize, InterpolationMode -from ais_bench.infer.interface import InferSession - -from cn_clip.clip.model import convert_weights, CLIP -import cn_clip.clip as clip -from cn_clip.training.main import convert_models_to_fp32 -from cn_clip.clip.utils import image_transform - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--vision-model", - choices=["ViT-B-32", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"], - default="ViT-B-16", - help="Name of the vision backbone to use.", - ) - parser.add_argument( - "--text-model", - choices=["RoBERTa-wwm-ext-base-chinese", "RoBERTa-wwm-ext-large-chinese", "RBT3-chinese"], - default="RoBERTa-wwm-ext-base-chinese", - help="Name of the text backbone to use.", - ) - parser.add_argument( - "--batch-size", type=int, default=20, help="Image batch size." - ) - parser.add_argument( - "--resume", - default="./models/clip_cn_vit-b-16.pt", - type=str, - help="path to latest checkpoint (default: none)", - ) - parser.add_argument( - "--npu-device", type=int, default=0, help="Npu device ID." - ) - parser.add_argument( - "--txt-model-path", - type=str, - default=None, - help="path to img om model." - ) - parser.add_argument( - "--img-model-path", - type=str, - default=None, - help="path to img om model." - ) - args = parser.parse_args() - - return args - - -def _convert_to_rgb(image): - return torch.tensor(np.array(image.convert('RGB')).astype("uint8")) - - -def image_transform_wo_normalize(image_size=224): - transform = Compose([ - Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), - _convert_to_rgb, - ]) - return transform - - -def cpu_infer(args, model_info, txt, img): - # get model - model = CLIP(**model_info) - # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372 - convert_weights(model) - convert_models_to_fp32(model) - # Resume from a checkpoint. 
- print("Begin to load model checkpoint from {}.".format(args.resume)) - assert os.path.exists(args.resume), "The checkpoint file {} not exists!".format(args.resume) - # Map model to be loaded to specified device. - checkpoint = torch.load(args.resume, map_location='cpu') - sd = checkpoint["state_dict"] - if next(iter(sd.items()))[0].startswith('module'): - sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k} - model.load_state_dict(sd) - model.eval() - - # cpu infer - with torch.no_grad(): - assert args.txt_model_path is not None or args.img_model_path is not None, "txt_model_path and img_model_path cannot both be None" - assert args.txt_model_path is None or args.img_model_path is None, "txt_model_path and img_model_path cannot both passing values" - if args.txt_model_path: - out = model(None, txt) - elif args.img_model_path: - out = model(img, None) - return out - - -def om_infer(args, txt, img): - assert args.txt_model_path is not None or args.img_model_path is not None, "txt_model_path and img_model_path cannot both be None" - assert args.txt_model_path is None or args.img_model_path is None, "txt_model_path and img_model_path cannot both passing values" - if args.txt_model_path: - session = InferSession(args.npu_device, args.txt_model_path) - out = session.infer([txt]) - elif args.img_model_path: - session = InferSession(args.npu_device, args.img_model_path) - out = session.infer([img]) - return torch.tensor(out[0]) - - -if __name__ == "__main__": - args = parse_args() - - # Get model config. - vision_model_config_file = Path(__file__).parent / \ - f"Chinese-CLIP/cn_clip/clip/model_configs/{args.vision_model.replace('/', '-')}.json" - assert os.path.exists(vision_model_config_file) - - text_model_config_file = Path(__file__).parent / \ - f"Chinese-CLIP/cn_clip/clip/model_configs/{args.text_model.replace('/', '-')}.json" - assert os.path.exists(text_model_config_file) - - with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft: - model_info = json.load(fv) - if isinstance(model_info['vision_layers'], str): - model_info['vision_layers'] = eval(model_info['vision_layers']) - for k, v in json.load(ft).items(): - model_info[k] = v - - # build inputs - text = clip.tokenize(["皮卡丘"], context_length=512).expand(args.batch_size, 512) - preprocess = image_transform(model_info["image_resolution"]) - image = preprocess(Image.open("Chinese-CLIP/examples/pokemon.jpeg")).unsqueeze(0).expand(args.batch_size,3,224,224) - preprocess = image_transform_wo_normalize(model_info["image_resolution"]) - image_wo_normalize = preprocess(Image.open("Chinese-CLIP/examples/pokemon.jpeg")).unsqueeze(0).expand(args.batch_size,224,224,3) - - x = cpu_infer(args, model_info, text, image) - y = om_infer(args, text, image_wo_normalize) - - similarity = torch.cosine_similarity(torch.tensor(x).reshape(-1), y.reshape(-1), dim=0) - print(f"模型输出的余弦相似度为:{similarity}") \ No newline at end of file diff --git a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_img_onnx.py b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_img_onnx.py index bb7a4878722879ce7c6b710c8bff6424b5b0995f..0d61ccac72d82209e31410e87423bb16a874c485 100644 --- a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_img_onnx.py +++ b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_img_onnx.py @@ -205,7 +205,7 @@ def pad_nz_block( padding_shape, original_shape = cal_padding_shape(graph, merged) axis = 0 if merged else 1 - new_concat_init = 
graph.add_initializer(f"padding_concat_init", np.zeros(padding_shape, dtype=np.float16)) + new_concat_init = graph.add_initializer(f"padding_concat_init", np.zeros(padding_shape, dtype=np.float32)) add_node = anchor_adds_2[0] new_concat_name = f"Concat_before_{add_node.name}" new_concat_node = graph.add_node(new_concat_name, "Concat", attrs={"axis": axis}) @@ -459,7 +459,7 @@ def adapt_for_attentionscore(graph: OnnxGraph, anchor_softmaxes: List[OnnxNode]) div_value = graph[div_before_matmul.inputs[1]].value new_mul_init = graph.add_initializer( f"{new_mul_name}_init", - np.array(1/div_value, dtype="float16") + np.array(1/div_value, dtype="float32") ) graph.insert_node(softmax_node.name, new_mul_node, mode="before") new_mul_node.inputs.append(new_mul_init.name) diff --git a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_txt_onnx.py b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_txt_onnx.py index 2653333a6d489bcb15a3f97352493fca2917e0a3..b265e98f9bd92bfe57acf2b39fd103d9b7b28e78 100644 --- a/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_txt_onnx.py +++ b/ACL_PyTorch/built-in/foundation_models/Chinese_CLIP/opt_txt_onnx.py @@ -27,20 +27,6 @@ def get_config(graph): def fix_attention_lnqkv(graph, qkv_start_node): - # insert reshape before qkv_start_node - reshape_before_add = graph.add_node( - f"Reshape_before_{qkv_start_node.name}", - "Reshape" - ) - reshape_init = graph.add_initializer( - f"{reshape_before_add.name}_value", - np.array([-1, HIDDEN_NUM], dtype="int64") - ) - if graph.get_node(qkv_start_node.inputs[0], node_type=Initializer): - graph.insert_node(qkv_start_node.name, reshape_before_add, refer_index=1, mode="before") - else: - graph.insert_node(qkv_start_node.name, reshape_before_add, refer_index=0, mode="before") - reshape_before_add.inputs.append(reshape_init.name) # change transpose node seen: List[List[int]] = [] @@ -96,7 +82,7 @@ def fix_attention_score(graph, softmax_node, bs, seq_len): f"bert_Mul_before_{add_node.name}", "Mul", ) - mul_init_value = np.array(1/div_init.value, dtype="float16") + mul_init_value = np.array(1/div_init.value, dtype="float32") mul_init = graph.add_initializer( f"{mul_node.name}_value", mul_init_value @@ -105,17 +91,6 @@ def fix_attention_score(graph, softmax_node, bs, seq_len): mul_node.inputs.append(mul_init.name) graph.remove(div_node.name) - expand_node = graph.add_node( - f"Expand_before_{add_node.name}", - "Expand" - ) - expand_init = graph.add_initializer( - f"{expand_node.name}_value", - np.array([bs, 1, seq_len, seq_len], dtype="int64") - ) - graph.insert_node(add_node.name, expand_node, refer_index=~refer_index, mode="before") - expand_node.inputs.append(expand_init.name) - def main(graph): # get config @@ -123,6 +98,24 @@ def main(graph): # fix_lnqkv add_nodes = graph.get_nodes("Add") + gather_node = graph.get_nodes("Gather")[0] + + # insert reshape before qkv_start_node + reshape_before_add = graph.add_node( + f"Reshape_2dims", + "Reshape" + ) + reshape_init = graph.add_initializer( + "Reshape_2dims_value", + np.array([-1, HIDDEN_NUM], dtype="int64") + ) + graph.insert_node(gather_node.name, reshape_before_add, mode="after") + + reshape_before_add.inputs.append(reshape_init.name) + + for add_node in add_nodes[:2]: + graph[add_node.inputs[1]].value = graph[add_node.inputs[1]].value.reshape(-1, HIDDEN_NUM) + for add_node in add_nodes: if len(graph.get_next_nodes(add_node.outputs[0])) == 4: fix_attention_lnqkv(graph, add_node) @@ -132,6 +125,22 @@ def main(graph): for softmax_node in softmax_nodes: 
fix_attention_score(graph, softmax_node, bs, seq_len) + # add expand node + expand_node = graph.add_node( + f"Expand_Mask", + "Expand" + ) + expand_init = graph.add_initializer( + f"expand_value", + np.array([bs, 1, seq_len, seq_len], dtype="int64") + ) + s_node = softmax_nodes[0] + a_node = graph.get_prev_node(s_node.inputs[0]) + m_node = graph.get_prev_node(a_node.inputs[1]) + expand_node.inputs=["mul_out", "expand_value"] + expand_node.outputs=[m_node.outputs[0]] + m_node.outputs=["mul_out"] + # insert last reshape to recover shape last_add = graph.get_nodes(op_type="Add")[-1] last_reshape = graph.add_node( @@ -142,10 +151,7 @@ def main(graph): f"{last_reshape.name}_value", np.array([bs, seq_len, HIDDEN_NUM], dtype="int64") ) - if graph.get_node(last_add.inputs[0], node_type=Initializer): - graph.insert_node(last_add.name, last_reshape, refer_index=1, mode="before") - else: - graph.insert_node(last_add.name, last_reshape, refer_index=0, mode="before") + graph.insert_node(last_add.name, last_reshape, mode="after") last_reshape.inputs.append(reshape_init.name)
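
---

A minimal sketch (not part of the patch) of how the patched `zeroshot_evaluation.py` drives the converted image-side OM model through `ais_bench`. The `InferSession(device_id, model_path)` constructor and `session.infer([...])` call are taken from the diff above; the model path, device id 0, and the static batch size of 24 are assumptions based on the README example in this change.

```python
# Hypothetical standalone illustration of the OM inference flow added by this patch.
# Assumes the bs=24 image model produced by the ATC step exists at the path below.
import torch
from ais_bench.infer.interface import InferSession

BATCH = 24  # static batch size the OM model was exported with

vision_session = InferSession(0, "models/vit-b-16.img.bs24.om")  # (device_id, om_path)


def encode_images(images: torch.Tensor) -> torch.Tensor:
    """Pad a (<=24, 3, 224, 224) float32 batch up to 24 rows, run the OM model,
    then drop the padded rows and L2-normalize, mirroring the logic the patch
    adds to run() for the last (partial) dataloader batch."""
    batch = images.size(0)
    if batch != BATCH:
        pad = torch.zeros((BATCH - batch, *images.shape[1:]), dtype=images.dtype)
        images = torch.cat((images, pad), dim=0)
    # infer() takes a list of inputs and returns a list of numpy outputs
    out = vision_session.infer([images.to(torch.float32)])
    features = torch.from_numpy(out[0][:batch])
    return features / features.norm(dim=-1, keepdim=True)


if __name__ == "__main__":
    # usage example with a dummy partial batch
    feats = encode_images(torch.rand(16, 3, 224, 224))
    print(feats.shape)  # torch.Size([16, 512])
```

Note that the patch builds the pad by slicing the incoming batch (`images[:(24-batch)]`), which only works when the remainder batch has at least 12 samples; the sketch above allocates the pad explicitly so it holds for any partial batch size.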