diff --git a/README.md b/README.md
index a92b22a32f0390f7d8cb860bbadfa7247c1f24d9..c9932f74103ab87db67e8d4cc1b75d5bf9c000eb 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
ConvNeXt-Base |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
@@ -90,7 +90,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
CSPResNet50 |
FP16 |
- - |
+ Supported |
Supported |
@@ -102,7 +102,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
DeiT-tiny |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
@@ -146,7 +146,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
DenseNet201 |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
@@ -197,6 +197,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- |
+
+ EfficientNet-B4 |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
EfficientNetV2 |
FP16 |
@@ -212,7 +223,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
EfficientNetv2_rw_t |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
@@ -274,6 +285,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
Supported |
+
+ Mixer_B |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
MNASNet0_5 |
FP16 |
@@ -285,6 +307,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- |
+
+ MNASNet0_75 |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
MobileNetV2 |
FP16 |
@@ -329,6 +362,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- |
+
+ RegNet_x_16gf |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
RegNet_x_1_6gf |
FP16 |
@@ -472,6 +516,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- |
+
+ ResNeXt101_32x8d |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
SEResNet50 |
FP16 |
@@ -528,26 +583,48 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- SqueezeNet 1.0 |
+ ShuffleNetV2_x2_0 |
FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
- |
- Supported |
+
+
+ SqueezeNet 1.0 |
+ FP16 |
+ Supported |
+ Supported |
INT8 |
- |
- Supported |
+ Supported |
SqueezeNet 1.1 |
FP16 |
- |
- Supported |
+ Supported |
+
+
+ INT8 |
+ - |
+ Supported |
+
+
+ SVT Base |
+ FP16 |
+ Supported |
+ - |
INT8 |
- |
- Supported |
+ - |
Swin Transformer |
@@ -571,6 +648,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- |
+
+ VGG11 |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
VGG16 |
FP16 |
@@ -593,6 +681,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
Supported |
Supported |
+
+ Wide ResNet101 |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
### Detection
@@ -652,7 +751,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
FoveaBox |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
@@ -663,7 +762,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
FSAF |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
@@ -674,7 +773,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
HRNet |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
@@ -725,6 +824,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- |
+
+ SABL |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
YOLOv3 |
FP16 |
@@ -824,6 +934,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- |
+
+ YOLOv11 |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
YOLOX |
FP16 |
@@ -867,29 +988,24 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
Models |
Precision |
IGIE |
- IxRT |
Kie_layoutXLM |
FP16 |
Supported |
- - |
INT8 |
- |
- - |
SVTR |
FP16 |
Supported |
- - |
INT8 |
- |
- - |
@@ -902,6 +1018,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
IGIE |
IxRT |
+
+ HRNetPose |
+ FP16 |
+ Supported |
+ - |
+
+
+ INT8 |
+ - |
+ - |
+
Lightweight OpenPose |
FP16 |
@@ -1026,209 +1153,236 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
-## NLP
-
-### Language Modelling
+## LLM (Large Language Model)
-### Large Language Model
+## Multimodal
+
+## NLP
+
+### Language Modelling
+
+
- Baichuan2-7B |
- Supported |
- - |
+ Models |
+ Precision |
+ IGIE |
+ IxRT |
+
+
+ ALBERT |
+ FP16 |
- |
+ Supported |
- ChatGLM-3-6B |
- Supported |
+ INT8 |
- |
- |
- ChatGLM-3-6B-32K |
- Supported |
+ BERT Base NER |
+ FP16 |
- |
- |
- Llama2-7B |
- Supported |
- Supported |
+ INT8 |
+ Supported |
- |
- Llama2-13B |
- - |
- Supported |
- - |
+ BERT Base SQuAD |
+ FP16 |
+ Supported |
+ Supported |
- Llama2-70B |
- - |
- Supported |
+ INT8 |
- |
+ Supported |
- Llama3-70B |
- Supported |
- - |
- - |
+ BERT Large SQuAD |
+ FP16 |
+ Supported |
+ Supported |
- MiniCPM-V-2 |
- Supported |
- - |
- - |
+ INT8 |
+ Supported |
+ Supported |
- Qwen-7B |
- Supported |
+ DeBERTa |
+ FP16 |
- |
- - |
+ Supported |
- Qwen1.5-7B |
- Supported |
+ INT8 |
+ - |
- |
- Supported |
- Qwen1.5-14B |
- Supported |
- - |
+ RoBERTa |
+ FP16 |
- |
+ Supported |
- Qwen1.5-32B Chat |
- Supported |
+ INT8 |
- |
- |
- Qwen1.5-72B |
- Supported |
- - |
+ RoFormer |
+ FP16 |
- |
+ Supported |
- Qwen2-7B Instruct |
- Supported |
+ INT8 |
- |
- |
- Qwen2-72B Instruct |
- Supported |
- - |
+ VideoBERT |
+ FP16 |
- |
+ Supported |
- StableLM2-1.6B |
- Supported |
+ INT8 |
- |
- |
diff --git a/models/cv/classification/deit_tiny/ixrt/README.md b/models/cv/classification/deit_tiny/ixrt/README.md
index 1ceea7bfbbb8bb888cc0cb2a14bc5de4773962fa..15fb025e9badbdb2f303f13a977675638489c90e 100644
--- a/models/cv/classification/deit_tiny/ixrt/README.md
+++ b/models/cv/classification/deit_tiny/ixrt/README.md
@@ -70,4 +70,4 @@ bash scripts/infer_deit_tiny_fp16_performance.sh
## Reference
-Deit_tiny:
\ No newline at end of file
+Deit_tiny:
diff --git a/models/cv/classification/vgg11/igie/README.md b/models/cv/classification/vgg11/igie/README.md
index 0206d951f48fc2be828725a66323111e85f0565f..522ff3e7a108eedb95cb61074ac1b90ebb8d027c 100644
--- a/models/cv/classification/vgg11/igie/README.md
+++ b/models/cv/classification/vgg11/igie/README.md
@@ -43,4 +43,4 @@ bash scripts/infer_vgg11_fp16_performance.sh
Model |BatchSize |Precision |FPS |Top-1(%) |Top-5(%)
--------|-----------|----------|----------|----------|--------
-VGG11 | 32 | FP16 | 3872.86 | 69.03 | 88.6
\ No newline at end of file
+VGG11 | 32 | FP16 | 3872.86 | 69.03 | 88.6
diff --git a/models/cv/classification/wide_resnet101/igie/README.md b/models/cv/classification/wide_resnet101/igie/README.md
index a72eeb1c48d770576d8e2d8bf00ac28cd7f4e404..93a5a3b8daa1d18ba43053f76ca90bdcb0b31bad 100644
--- a/models/cv/classification/wide_resnet101/igie/README.md
+++ b/models/cv/classification/wide_resnet101/igie/README.md
@@ -43,4 +43,4 @@ bash scripts/infer_wide_resnet101_fp16_performance.sh
| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) |
| -------------- | --------- | --------- | -------- | -------- | -------- |
-| Wide ResNet101 | 32 | FP16 | 1339.037 | 78.459 | 94.052 |
\ No newline at end of file
+| Wide ResNet101 | 32 | FP16 | 1339.037 | 78.459 | 94.052 |
diff --git a/models/cv/detection/sabl/igie/README.md b/models/cv/detection/sabl/igie/README.md
index 975e72daaef46a7c36d31cc4992a4524b2c7084a..28d0242e2905f30d1f520a3d8f4245fe86ccf5f5 100644
--- a/models/cv/detection/sabl/igie/README.md
+++ b/models/cv/detection/sabl/igie/README.md
@@ -27,6 +27,7 @@ Dataset: to download the valida
```bash
wget https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth
```
+
### Model Conversion
```bash
diff --git a/models/cv/pose_estimation/hrnetpose/igie/README.md b/models/cv/pose_estimation/hrnetpose/igie/README.md
index 1785d1f7363a5379fce907fdcb19316399ff5850..c4f0758fd794e3d6a553955697b9334929d99b65 100644
--- a/models/cv/pose_estimation/hrnetpose/igie/README.md
+++ b/models/cv/pose_estimation/hrnetpose/igie/README.md
@@ -5,6 +5,7 @@
HRNetPose (High-Resolution Network for Pose Estimation) is a high-performance human pose estimation model introduced in the paper "Deep High-Resolution Representation Learning for Human Pose Estimation". It is designed to address the limitations of traditional methods by maintaining high-resolution feature representations throughout the network, enabling more accurate detection of human keypoints.
## Setup
+
### Install
```bash
@@ -18,6 +19,7 @@ pip3 install -r requirements.txt
```
### Download
+
Pretrained model:
Dataset: to download the validation dataset.
@@ -27,6 +29,7 @@ wget https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192
```
### Model Conversion
+
```bash
# export onnx model
python3 export.py --weight hrnet_w32_coco_256x192-c78dce93_20200708.pth --cfg td-hm_hrnet-w32_8xb64-210e_coco-256x192.py --input 1,3,256,192 --output hrnetpose.onnx
@@ -58,4 +61,4 @@ bash scripts/infer_hrnetpose_fp16_performance.sh
## Reference
-mmpose:
\ No newline at end of file
+mmpose:
diff --git a/models/vision-language-understanding/chameleon-7b/vllm/README.md b/models/multimodal/vision_language_understanding/chameleon_7b/vllm/README.md
similarity index 99%
rename from models/vision-language-understanding/chameleon-7b/vllm/README.md
rename to models/multimodal/vision_language_understanding/chameleon_7b/vllm/README.md
index bebd7c799c66db9bc8ae7fe093ea945d34bbdfca..568dbd7011ff4bf157b4a0e2ee027729f192f9a0 100755
--- a/models/vision-language-understanding/chameleon-7b/vllm/README.md
+++ b/models/multimodal/vision_language_understanding/chameleon_7b/vllm/README.md
@@ -2,7 +2,7 @@
## Description
-Chameleon, an AI system that mitigates these limitations by augmenting LLMs with plug-and-play modules for compositional reasoning. Chameleon synthesizes programs by composing various tools (e.g., LLMs, off-the-shelf vision models, web search engines, Python functions, and heuristic-based modules) for accomplishing complex reasoning tasks. At the heart of Chameleon is an LLM-based planner that assembles a sequence of tools to execute to generate the final response. We showcase the effectiveness of Chameleon on two multi-modal knowledge-intensive reasoning tasks: ScienceQA and TabMWP. Chameleon, powered by GPT-4, achieves an 86.54% overall accuracy on ScienceQA, improving the best published few-shot result by 11.37%. On TabMWP, GPT-4-powered Chameleon improves the accuracy by 17.0%, lifting the state of the art to 98.78%. Our analysis also shows that the GPT-4-powered planner exhibits more consistent and rational tool selection via inferring potential constraints from instructions, compared to a ChatGPT-powered planner.
+Chameleon is an AI system that mitigates the limitations of LLMs by augmenting them with plug-and-play modules for compositional reasoning. Chameleon synthesizes programs by composing various tools (e.g., LLMs, off-the-shelf vision models, web search engines, Python functions, and heuristic-based modules) to accomplish complex reasoning tasks. At the heart of Chameleon is an LLM-based planner that assembles a sequence of tools to execute in order to generate the final response. Chameleon demonstrates its effectiveness on two multi-modal knowledge-intensive reasoning tasks: ScienceQA and TabMWP. Powered by GPT-4, Chameleon achieves 86.54% overall accuracy on ScienceQA, improving the best published few-shot result by 11.37%. On TabMWP, GPT-4-powered Chameleon improves accuracy by 17.0%, lifting the state of the art to 98.78%. Analysis also shows that the GPT-4-powered planner exhibits more consistent and rational tool selection by inferring potential constraints from instructions, compared to a ChatGPT-powered planner.
## Setup
@@ -32,4 +32,4 @@ mkdir data
```bash
export VLLM_ASSETS_CACHE=../vllm/
python3 offline_inference_vision_language.py --model ./data/chameleon-7b --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
-```
\ No newline at end of file
+```
diff --git a/models/vision-language-understanding/chameleon-7b/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_understanding/chameleon_7b/vllm/offline_inference_vision_language.py
similarity index 100%
rename from models/vision-language-understanding/chameleon-7b/vllm/offline_inference_vision_language.py
rename to models/multimodal/vision_language_understanding/chameleon_7b/vllm/offline_inference_vision_language.py
diff --git a/models/vision-language-understanding/chameleon-7b/vllm/utils.py b/models/multimodal/vision_language_understanding/chameleon_7b/vllm/utils.py
similarity index 100%
rename from models/vision-language-understanding/chameleon-7b/vllm/utils.py
rename to models/multimodal/vision_language_understanding/chameleon_7b/vllm/utils.py
diff --git a/models/vision-language-understanding/Intern_VL/vllm/vllm_public_assets/cherry_blossom.jpg b/models/multimodal/vision_language_understanding/chameleon_7b/vllm/vllm_public_assets/cherry_blossom.jpg
similarity index 100%
rename from models/vision-language-understanding/Intern_VL/vllm/vllm_public_assets/cherry_blossom.jpg
rename to models/multimodal/vision_language_understanding/chameleon_7b/vllm/vllm_public_assets/cherry_blossom.jpg
diff --git a/models/vision-language-understanding/fuyu-8b/vllm/README.md b/models/multimodal/vision_language_understanding/fuyu_8b/vllm/README.md
similarity index 98%
rename from models/vision-language-understanding/fuyu-8b/vllm/README.md
rename to models/multimodal/vision_language_understanding/fuyu_8b/vllm/README.md
index 7bc5d2cc04268a63d6dd1bebb4e040f55f80be4d..96d5acc3b208b72de34624acb74849a7536670d0 100755
--- a/models/vision-language-understanding/fuyu-8b/vllm/README.md
+++ b/models/multimodal/vision_language_understanding/fuyu_8b/vllm/README.md
@@ -1,4 +1,4 @@
-# FuyuForCausalLM
+# Fuyu-8B
## Description
@@ -34,4 +34,4 @@ mkdir data
```bash
export VLLM_ASSETS_CACHE=../vllm/
python3 offline_inference_vision_language.py --model ./data/fuyu-8b --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
-```
\ No newline at end of file
+```
diff --git a/models/vision-language-understanding/fuyu-8b/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_understanding/fuyu_8b/vllm/offline_inference_vision_language.py
similarity index 100%
rename from models/vision-language-understanding/fuyu-8b/vllm/offline_inference_vision_language.py
rename to models/multimodal/vision_language_understanding/fuyu_8b/vllm/offline_inference_vision_language.py
diff --git a/models/vision-language-understanding/fuyu-8b/vllm/utils.py b/models/multimodal/vision_language_understanding/fuyu_8b/vllm/utils.py
similarity index 100%
rename from models/vision-language-understanding/fuyu-8b/vllm/utils.py
rename to models/multimodal/vision_language_understanding/fuyu_8b/vllm/utils.py
diff --git a/models/vision-language-understanding/LLava/vllm/vllm_public_assets/cherry_blossom.jpg b/models/multimodal/vision_language_understanding/fuyu_8b/vllm/vllm_public_assets/cherry_blossom.jpg
similarity index 100%
rename from models/vision-language-understanding/LLava/vllm/vllm_public_assets/cherry_blossom.jpg
rename to models/multimodal/vision_language_understanding/fuyu_8b/vllm/vllm_public_assets/cherry_blossom.jpg
diff --git a/models/vision-language-understanding/Intern_VL/vllm/README.md b/models/multimodal/vision_language_understanding/intern_vl/vllm/README.md
similarity index 98%
rename from models/vision-language-understanding/Intern_VL/vllm/README.md
rename to models/multimodal/vision_language_understanding/intern_vl/vllm/README.md
index 0b09f06eaef8da47b0f9fced467f6d34770191e7..cafe64f99ad80d3602a3ba3f478ae6b1f8cdb3fb 100644
--- a/models/vision-language-understanding/Intern_VL/vllm/README.md
+++ b/models/multimodal/vision_language_understanding/intern_vl/vllm/README.md
@@ -6,7 +6,7 @@ InternVL2-4B is a large-scale multimodal model developed by WeTab AI, designed t
## Setup
-### Instal
+### Install
In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
@@ -22,6 +22,7 @@ pip3 install vllm
pip3 install triton
pip3 install ixformer
```
+
### Download
-Model:
diff --git a/models/vision-language-understanding/Intern_VL/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_understanding/intern_vl/vllm/offline_inference_vision_language.py
similarity index 100%
rename from models/vision-language-understanding/Intern_VL/vllm/offline_inference_vision_language.py
rename to models/multimodal/vision_language_understanding/intern_vl/vllm/offline_inference_vision_language.py
diff --git a/models/vision-language-understanding/Intern_VL/vllm/utils.py b/models/multimodal/vision_language_understanding/intern_vl/vllm/utils.py
similarity index 100%
rename from models/vision-language-understanding/Intern_VL/vllm/utils.py
rename to models/multimodal/vision_language_understanding/intern_vl/vllm/utils.py
diff --git a/models/vision-language-understanding/chameleon-7b/vllm/vllm_public_assets/cherry_blossom.jpg b/models/multimodal/vision_language_understanding/intern_vl/vllm/vllm_public_assets/cherry_blossom.jpg
similarity index 100%
rename from models/vision-language-understanding/chameleon-7b/vllm/vllm_public_assets/cherry_blossom.jpg
rename to models/multimodal/vision_language_understanding/intern_vl/vllm/vllm_public_assets/cherry_blossom.jpg
diff --git a/models/vision-language-understanding/LLava/vllm/README.md b/models/multimodal/vision_language_understanding/llava/vllm/README.md
similarity index 99%
rename from models/vision-language-understanding/LLava/vllm/README.md
rename to models/multimodal/vision_language_understanding/llava/vllm/README.md
index 1b805f41751576d165f5aa5d5ad1259abd5a20f1..bbc251d2b3766dde38214515217f0faf75085d12 100644
--- a/models/vision-language-understanding/LLava/vllm/README.md
+++ b/models/multimodal/vision_language_understanding/llava/vllm/README.md
@@ -4,7 +4,6 @@
LLaVA is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data. It is an auto-regressive language model, based on the transformer architecture. The LLaVA-NeXT model was proposed in LLaVA-NeXT: Improved reasoning, OCR, and world knowledge by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee. LLaVa-NeXT (also called LLaVa-1.6) improves upon LLaVa-1.5 by increasing the input image resolution and training on an improved visual instruction tuning dataset to improve OCR and common sense reasoning.
-
## Setup
### Install
@@ -24,7 +23,6 @@ pip3 install transformers
-llava-v1.6-vicuna-7b-hf:
-
```bash
# Download model from the website and make sure the model's path is "data/llava"
mkdir data
@@ -39,11 +37,10 @@ export PATH=/usr/local/corex/bin:${PATH}
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
```
-
### Inference llava-1.6
```bash
export VLLM_ASSETS_CACHE=../vllm/
export CUDA_VISIBLE_DEVICES=0,1,2,3
python3 offline_inference_vision_language.py --model /path/to/model --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --model-type llava-next --max-model-len 4096
-```
\ No newline at end of file
+```
diff --git a/models/vision-language-understanding/LLava/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_understanding/llava/vllm/offline_inference_vision_language.py
similarity index 100%
rename from models/vision-language-understanding/LLava/vllm/offline_inference_vision_language.py
rename to models/multimodal/vision_language_understanding/llava/vllm/offline_inference_vision_language.py
diff --git a/models/vision-language-understanding/LLava/vllm/utils.py b/models/multimodal/vision_language_understanding/llava/vllm/utils.py
similarity index 100%
rename from models/vision-language-understanding/LLava/vllm/utils.py
rename to models/multimodal/vision_language_understanding/llava/vllm/utils.py
diff --git a/models/vision-language-understanding/fuyu-8b/vllm/vllm_public_assets/cherry_blossom.jpg b/models/multimodal/vision_language_understanding/llava/vllm/vllm_public_assets/cherry_blossom.jpg
similarity index 100%
rename from models/vision-language-understanding/fuyu-8b/vllm/vllm_public_assets/cherry_blossom.jpg
rename to models/multimodal/vision_language_understanding/llava/vllm/vllm_public_assets/cherry_blossom.jpg
diff --git a/models/vision-language-understanding/llava_next_video-7b/vllm/README.md b/models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/README.md
similarity index 97%
rename from models/vision-language-understanding/llava_next_video-7b/vllm/README.md
rename to models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/README.md
index a50af3a220158968f5de39e48aed3bf61362e011..bf4b268310243ce6b51b23efdbf0d97aed5a573c 100755
--- a/models/vision-language-understanding/llava_next_video-7b/vllm/README.md
+++ b/models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/README.md
@@ -1,4 +1,4 @@
-# LLaVA-Next-Video
+# LLaVA-Next-Video-7B
## Description
@@ -32,4 +32,4 @@ mkdir data
```bash
export VLLM_ASSETS_CACHE=../vllm/
python3 offline_inference_vision_language.py --model ./data/LLaVA-NeXT-Video-7B-hf --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --model-type llava-next-video --modality video --dtype bfloat16
-```
\ No newline at end of file
+```
diff --git a/models/vision-language-understanding/llava_next_video-7b/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/offline_inference_vision_language.py
similarity index 100%
rename from models/vision-language-understanding/llava_next_video-7b/vllm/offline_inference_vision_language.py
rename to models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/offline_inference_vision_language.py
diff --git a/models/vision-language-understanding/llava_next_video-7b/vllm/utils.py b/models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/utils.py
similarity index 100%
rename from models/vision-language-understanding/llava_next_video-7b/vllm/utils.py
rename to models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/utils.py
diff --git a/models/vision-language-understanding/llava_next_video-7b/vllm/video-eample-data/sample_demo_1.mp4 b/models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4
similarity index 100%
rename from models/vision-language-understanding/llava_next_video-7b/vllm/video-eample-data/sample_demo_1.mp4
rename to models/multimodal/vision_language_understanding/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4
diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md b/models/multimodal/vision_language_understanding/minicpm_v_2/vllm/README.md
similarity index 79%
rename from models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md
rename to models/multimodal/vision_language_understanding/minicpm_v_2/vllm/README.md
index 2dc49881226176329767fcf52f9d0742a4912056..149f01f15dac0c9b701b1f0cba9ad41bc242da9e 100644
--- a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md
+++ b/models/multimodal/vision_language_understanding/minicpm_v_2/vllm/README.md
@@ -1,8 +1,8 @@
-# MiniCPM-V-2
+# MiniCPM V2
## Description
-MiniCPM-V-2 is a compact and efficient language model designed for various natural language processing (NLP) tasks. Building on its predecessor, MiniCPM-V-1, this model integrates advancements in architecture and optimization techniques, making it suitable for deployment in resource-constrained environments.s
+MiniCPM V2 is a compact and efficient language model designed for various natural language processing (NLP) tasks. Building on its predecessor, MiniCPM-V-1, this model integrates advancements in architecture and optimization techniques, making it suitable for deployment in resource-constrained environments.
## Setup
diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh b/models/multimodal/vision_language_understanding/minicpm_v_2/vllm/ci/prepare.sh
similarity index 100%
rename from models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh
rename to models/multimodal/vision_language_understanding/minicpm_v_2/vllm/ci/prepare.sh
diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py b/models/multimodal/vision_language_understanding/minicpm_v_2/vllm/minicpmv-2.0-offline.py
similarity index 100%
rename from models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py
rename to models/multimodal/vision_language_understanding/minicpm_v_2/vllm/minicpmv-2.0-offline.py