diff --git a/models/cv/classification/clip/ixformer/README.md b/models/cv/classification/clip/ixformer/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5b43ffada6dc50dadd1d13234d0ba468f8f4b82c
--- /dev/null
+++ b/models/cv/classification/clip/ixformer/README.md
@@ -0,0 +1,40 @@
+# CLIP
+
+## Description
+
+CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be instructed in natural language to predict the most relevant text snippet for a given image, without being directly optimized for the task, similar to the zero-shot capabilities of GPT-2 and GPT-3. CLIP matches the performance of the original ResNet-50 on zero-shot ImageNet classification without using any of the 1.28M labeled training examples, overcoming several major challenges in computer vision.
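+
+The snippet below is a minimal zero-shot classification sketch using the standard Hugging Face `transformers` CLIP API (inference.py in this directory swaps in the IxFormer implementation of `CLIPModel`); it assumes the checkpoint has already been placed at the local path used in the Download step.
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+# Local checkpoint path from the Download step; adjust if your weights live elsewhere.
+model_path = "/home/data/openai/clip-vit-base-patch32"
+model = CLIPModel.from_pretrained(model_path).eval()
+processor = CLIPProcessor.from_pretrained(model_path)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# Score the image against free-form text prompts.
+inputs = processor(
+    text=["a photo of a cat", "a photo of a dog"],
+    images=image,
+    return_tensors="pt",
+    padding=True,
+)
+with torch.no_grad():
+    probs = model(**inputs).logits_per_image.softmax(dim=1)
+print(probs)  # label probabilities for each prompt
+```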
+
+## Setup
+
+### Install
+
+To run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+pip3 install -U transformers==4.27.1
+```
+
+### Download
+
+Pretrained model: download the pretrained checkpoint you need; here we use clip-vit-base-patch32.
+
+```bash
+# Download the model and make sure its path is "/home/data/openai/clip-vit-base-patch32"
+mkdir -p /home/data/openai
+unzip clip-vit-base-patch32.zip -d /home/data/openai
+```
+
+## Run model
+
+### Test using the OpenAI interface
+
+Adjust the model path in the inference.py script if your weights are stored elsewhere, then run:
+
+```bash
+python3 inference.py
+```
diff --git a/models/cv/classification/clip/ixformer/inference.py b/models/cv/classification/clip/ixformer/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..013c96e8cd5123312751021a3ae0ee898515826e
--- /dev/null
+++ b/models/cv/classification/clip/ixformer/inference.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import time
+
+import requests
+import torch
+
+# Use the IxFormer implementation of CLIP in place of the stock transformers model.
+# from transformers import CLIPModel
+from ixformer.inference.models.clip import CLIPModel
+from PIL import Image
+from torch.cuda import profiler
+from transformers import CLIPProcessor
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = (
+    CLIPModel.from_pretrained("/home/data/openai/clip-vit-base-patch32")
+    .to(device)
+    .half()
+)
+model = model.eval()
+processor = CLIPProcessor.from_pretrained("/home/data/openai/clip-vit-base-patch32")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# Benchmark throughput for a range of batch sizes.
+batch_size_list = [32, 64, 128, 256, 512, 1024, 2048]
+with torch.no_grad():
+    for batch_size in batch_size_list:
+        # Replicate the same image to build a batch.
+        images = [image] * batch_size
+        inputs = processor(
+            text=["a photo of a cat", "a photo of a dog"],
+            images=images,
+            return_tensors="pt",
+            padding=True,
+        )
+        inputs["input_ids"] = inputs["input_ids"].to(device)
+        inputs["attention_mask"] = inputs["attention_mask"].to(device)
+        # Cast pixel values to fp16 to match the half-precision model.
+        inputs["pixel_values"] = inputs["pixel_values"].to(device).half()
+        # Warm up before timing.
+        for _ in range(2):
+            outputs = model(**inputs)
+        torch.cuda.synchronize()
+        # Time a single forward pass.
+        profiler.start()
+        start_time = time.perf_counter()
+        outputs = model(**inputs)
+        profiler.stop()
+        torch.cuda.synchronize()
+        end_time = time.perf_counter()
+        # logits_per_image holds the image-text similarity scores.
+        logits_per_image = outputs.logits_per_image
+        # Softmax over the text prompts gives label probabilities.
+        probs = logits_per_image.softmax(dim=1)
+        print(probs[:5])
+        print(probs[-5:])
+
+        print(f"batch_size: {batch_size}, QPS: {batch_size / (end_time - start_time)}")