From fdf9736a3f4891e26ebff7a8345719b71d67eb2f Mon Sep 17 00:00:00 2001
From: 宋建林
Date: Fri, 22 Nov 2024 15:48:01 +0000
Subject: [PATCH] add align_lunwenfuxian.py. Reproduce the ALIGN paper with mindNLP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 宋建林
---
 align_lunwenfuxian.py | 82 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 align_lunwenfuxian.py

diff --git a/align_lunwenfuxian.py b/align_lunwenfuxian.py
new file mode 100644
index 0000000..cd27612
--- /dev/null
+++ b/align_lunwenfuxian.py
@@ -0,0 +1,82 @@
+import mindspore as ms
+from mindnlp.transformers import AlignModel, AlignProcessor
+from mindspore import Tensor
+import numpy as np
+from PIL import Image
+from pycocotools.coco import COCO
+import os
+from tqdm import tqdm  # progress bar for the evaluation loop
+
+# Set the context to use CPU (or GPU if available)
+ms.set_context(mode=ms.GRAPH_MODE, device_target="CPU")  # or "GPU" if you have a GPU
+
+# Step 1: Specify the model name (Hugging Face's ALIGN checkpoint)
+model_name = "kakaobrain/align-base"
+
+# Step 2: Load the processor
+processor = AlignProcessor.from_pretrained(model_name)
+
+# Step 3: Load the model
+model = AlignModel.from_pretrained(model_name)
+
+# Step 4: Load the MSCOCO dataset
+dataDir = r'E:\Code\Dataset\MSCOCO'  # Replace with the path to your MSCOCO dataset (raw string avoids backslash escapes)
+dataType = 'val2017'
+annFile = f'{dataDir}/annotations/captions_{dataType}.json'
+coco = COCO(annFile)
+
+# Step 5: Prepare input data
+def get_image_and_caption(coco, img_id, dataDir):
+    ann_ids = coco.getAnnIds(imgIds=img_id)
+    anns = coco.loadAnns(ann_ids)
+    caption = anns[0]['caption']  # Use the first caption for simplicity
+    img_info = coco.loadImgs(img_id)[0]
+    img_path = os.path.join(dataDir, dataType, img_info['file_name'])
+    image = Image.open(img_path)
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    return image, caption
+
+# Step 6: Evaluate the model
+def evaluate_model(coco, model, processor, dataDir, num_samples=1000):
+    img_ids = coco.getImgIds()
+    image_embeds_list = []
+    text_embeds_list = []
+
+    # Use tqdm to show progress
+    for img_id in tqdm(img_ids[:num_samples], desc="Evaluating"):  # Evaluate on a subset of the dataset
+        image, caption = get_image_and_caption(coco, img_id, dataDir)
+        inputs = processor(text=caption, images=image, return_tensors="np")
+        input_ids = Tensor(inputs["input_ids"].astype(np.int64))
+        attention_mask = Tensor(inputs["attention_mask"].astype(np.int64))
+        pixel_values = Tensor(inputs["pixel_values"])
+        output = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
+
+        # Collect embeddings
+        image_embeds_list.append(output.image_embeds.asnumpy())
+        text_embeds_list.append(output.text_embeds.asnumpy())
+
+    # Convert to numpy arrays
+    image_embeds = np.vstack(image_embeds_list)
+    text_embeds = np.vstack(text_embeds_list)
+
+    # Calculate I2T and T2I R@1
+    i2t_r1 = calculate_recall(image_embeds, text_embeds, k=1)
+    t2i_r1 = calculate_recall(text_embeds, image_embeds, k=1)
+
+    print(f"MSCOCO I2T R@1: {i2t_r1}")
+    print(f"MSCOCO T2I R@1: {t2i_r1}")
+
+def calculate_recall(query_embeds, gallery_embeds, k=1):
+    # Calculate cosine similarity
+    query_embeds = query_embeds / np.linalg.norm(query_embeds, axis=1, keepdims=True)
+    gallery_embeds = gallery_embeds / np.linalg.norm(gallery_embeds, axis=1, keepdims=True)
+    similarity = np.dot(query_embeds, gallery_embeds.T)
+    # Get the top-k indices
+    top_k_indices = np.argsort(-similarity, axis=1)[:, :k]
+    # Check if the correct match is in the top-k (query i is paired with gallery i)
+    recall = np.mean(np.any(top_k_indices == np.arange(len(query_embeds))[:, None], axis=1))
+    return recall
+
+# Step 7: Run the evaluation
+evaluate_model(coco, model, processor, dataDir, num_samples=1000)
-- 
Gitee
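
Reviewer note: below is a quick, self-contained sanity check for the calculate_recall helper in this patch. The function body is copied out of align_lunwenfuxian.py rather than imported, because Step 7 runs the full evaluation at module import time. With identical query and gallery embeddings every query's nearest neighbour is itself, so R@1 must be exactly 1.0; against an unrelated random gallery it should land near chance (1/16 in expectation for 16 samples). Shapes and seed here are illustrative only.

    import numpy as np

    def calculate_recall(query_embeds, gallery_embeds, k=1):
        # Same logic as the patch: cosine similarity, then top-k hit rate
        query_embeds = query_embeds / np.linalg.norm(query_embeds, axis=1, keepdims=True)
        gallery_embeds = gallery_embeds / np.linalg.norm(gallery_embeds, axis=1, keepdims=True)
        similarity = np.dot(query_embeds, gallery_embeds.T)
        top_k_indices = np.argsort(-similarity, axis=1)[:, :k]
        return np.mean(np.any(top_k_indices == np.arange(len(query_embeds))[:, None], axis=1))

    rng = np.random.default_rng(0)
    embeds = rng.normal(size=(16, 8)).astype(np.float32)
    assert calculate_recall(embeds, embeds.copy(), k=1) == 1.0  # self-retrieval is perfect
    print(calculate_recall(embeds, rng.normal(size=(16, 8)).astype(np.float32), k=1))  # near 1/16 in expectation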
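
One more suggestion, not part of the patch: because Step 7 calls evaluate_model at module level, importing align_lunwenfuxian.py from a test or another script immediately kicks off the 1000-sample run. A minimal sketch of a guard that would keep the evaluation out of import time:

    if __name__ == "__main__":
        evaluate_model(coco, model, processor, dataDir, num_samples=1000)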