From 8e5bd2aa1d4b00b1e9c9109cb09cdca1ce71ac3a Mon Sep 17 00:00:00 2001
From: Jiejing Zhang <kzjeef@gmail.com>
Date: Fri, 23 May 2025 17:57:00 +0800
Subject: [PATCH] mm: add qwen vl2.5 model support.

- add qwen vl 2.5 model support.
- Qwen VL2.5 only supports 'transformers' as the ViT engine (TRT is not
  supported yet).
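- raise the minimum transformers version so that the VL2.5 model code is
  available, and relax the exact pins on numpy, requests, nvtx and cmake to
  minimum versions.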

test command:

server:
`dashinfer_vlm_serve --model qwen/Qwen2.5-VL-3B-Instruct --vision_engine transformers --port 8000 --host=127.0.0.1`

client:
```
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen/Qwen2.5-VL-3B-Instruct", "messages": [{"role": "user", "content": [{ "type": "text", "text": "Describe the image." }, {"type": "image_url", "image_url": {"url": "https://farm4.staticflickr.com/3075/3168662394_7d7103de7d_z_d.jpg"}}]}], "max_completion_tokens": 1024, "top_p": 0.5, "temperature": 0.1, "frequency_penalty": 1.05 }'
```

result:
```
{"id":"chatcmpl-rxqDiCQEJweEeeB7FADiER","object":"chat.completion",
"created":1747992522,"model":"model","choices":[{"index":0,"message":{"role":"assistant","content":"The
image features a small hummingbird perched on a branch. The bird is positioned in the center of the scene,
with its vibrant colors and delicate features clearly visible. The hummingbird appears to be enjoying its
time in nature, possibly searching for food or simply resting on the branch. \n\nThere are no other birds
or animals present in the image, making it a solitary moment captured in this natural
setting."},"finish_reason":"stop"}],"usage":{"prompt_tokens":382,"total_tokens":95,"completion_tokens":81}}
```
---
 .../vl_inference/utils/model_loader.py        | 102 +++++++++++++-----
 multimodal/requirements.txt                   |  10 +-
 2 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py b/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py
index 8af2ed253..f6f6aaa41 100644
--- a/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py
+++ b/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py
@@ -6,10 +6,14 @@
 import torch
 import glob
 import warnings
-from modelscope import snapshot_download
-from transformers import Qwen2VLForConditionalGeneration, AutoConfig, AutoTokenizer
-from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
 from tqdm import tqdm
+
+from transformers import AutoConfig, AutoTokenizer, AutoProcessor
+
+from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+
 from safetensors.torch import safe_open
 from dashinfer import allspark
 from dashinfer.allspark.model_loader import HuggingFaceModel, ModelSerializerException
@@ -59,25 +63,58 @@ def load_model(
             # the open-source model can be loaded by huggingface
             try:
                 if not os.path.isdir(self.hf_model_path):
+                    from modelscope import snapshot_download
                     self.hf_model_path = snapshot_download(self.hf_model_path)
-                self.torch_model = Qwen2VLForConditionalGeneration.from_pretrained(
-                    self.hf_model_path,
-                    trust_remote_code=self.trust_remote_code,
-                    torch_dtype=dtype_to_torch_dtype(self.data_type),
-                    device_map="cpu",
-                    **kwargs,
-                ).eval()
-                self.vit_config = Qwen2VLVisionConfig.from_pretrained(
-                    self.hf_model_path,
-                    trust_remote_code=True,
-                    revision=None,
-                    code_revision=None,
-                )
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.hf_model_path,
-                    trust_remote_code=self.trust_remote_code,
-                    **kwargs,
+
+                # Read config to determine model architecture
+                self.hf_model_config = AutoConfig.from_pretrained(
+                    self.hf_model_path, trust_remote_code=self.trust_remote_code
                 )
+
+                if hasattr(self.hf_model_config, "architectures") and "Qwen2_5_VLForConditionalGeneration" in self.hf_model_config.architectures:
+                    self.torch_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                        self.hf_model_path,
+                        trust_remote_code=self.trust_remote_code,
+                        torch_dtype=dtype_to_torch_dtype(self.data_type),
+                        device_map="cpu",
+                        **kwargs,
+                    ).eval()
+                    self.tokenizer = AutoTokenizer.from_pretrained(
+                        self.hf_model_path,
+                        trust_remote_code=self.trust_remote_code,
+                        **kwargs,
+                    )
+                    self.processor = AutoProcessor.from_pretrained(
+                        self.hf_model_path,
+                        trust_remote_code=self.trust_remote_code,
+                        **kwargs,
+                    )
+                    self.vit_config = Qwen2_5_VLVisionConfig.from_pretrained(
+                        self.hf_model_path,
+                        trust_remote_code=True,
+                        revision=None,
+                        code_revision=None,
+                    )
+                else:
+                    self.torch_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                        self.hf_model_path,
+                        trust_remote_code=self.trust_remote_code,
+                        torch_dtype=dtype_to_torch_dtype(self.data_type),
+                        device_map="cpu",
+                        **kwargs,
+                    ).eval()
+                    self.tokenizer = AutoTokenizer.from_pretrained(
+                        self.hf_model_path,
+                        trust_remote_code=self.trust_remote_code,
+                        **kwargs,
+                    )
+                    self.vit_config = Qwen2VLVisionConfig.from_pretrained(
+                        self.hf_model_path,
+                        trust_remote_code=True,
+                        revision=None,
+                        code_revision=None,
+                    )
+                pass
             except Exception as e:
                 print(
                     f"exception when load model: {self.hf_model_path} , exception: {e}"
@@ -102,10 +139,10 @@ def read_model_config(self):
             self.hf_model_config = AutoConfig.from_pretrained(
                 self.hf_model_path, trust_remote_code=self.trust_remote_code
             )
-            self.adapter = QWen2ConfigAdapter(self.hf_model_config)
-            self.as_model_config = self.adapter.model_config
-            if self.user_set_data_type is None:
-                self.data_type = self.adapter.get_model_data_type()
+        self.adapter = QWen2ConfigAdapter(self.hf_model_config)
+        self.as_model_config = self.adapter.model_config
+        if self.user_set_data_type is None:
+            self.data_type = self.adapter.get_model_data_type()
         return self
 
     def serialize(
@@ -127,17 +164,26 @@ def serialize(
             onnx_trt_obj.export_onnx(onnxFile)
             onnx_trt_obj.generate_trt_engine(onnxFile, self.vision_model_path)
         elif self.vision_engine == "transformers":
-            visual_model = Qwen2VLForConditionalGeneration.from_pretrained(
+            if hasattr(self.hf_model_config, "architectures") and "Qwen2_5_VLForConditionalGeneration" in self.hf_model_config.architectures:
+                visual_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                     self.hf_model_path,
                     trust_remote_code=self.trust_remote_code,
                     torch_dtype=dtype_to_torch_dtype(self.data_type),
-                    device_map="cpu",
-                    attn_implementation="flash_attention_2",
+                    device_map="auto",
+                    attn_implementation="sdpa",
+                ).visual.eval()
+            else:
+                visual_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=self.trust_remote_code,
+                    torch_dtype=dtype_to_torch_dtype(self.data_type),
+                    device_map="auto",
+                    attn_implementation="sdpa",
                 ).visual.eval()
             self.vision_model_path = visual_model
         else:
             raise ValueError(f"unsupported engine {self.vision_engine}")
-        
+
         # Convert Allspark LLM
         enable_quant = False
         weight_only_quant=False
diff --git a/multimodal/requirements.txt b/multimodal/requirements.txt
index 5e87ce66b..c96f0909a 100644
--- a/multimodal/requirements.txt
+++ b/multimodal/requirements.txt
@@ -1,9 +1,9 @@
 dashinfer@https://github.com/modelscope/dash-infer/releases/download/v2.0.0-rc3/dashinfer-2.0.0rc3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
 av
-numpy==1.24.3
-requests==2.32.3
-nvtx==0.2.10
-transformers>=4.45.0
+numpy>=1.24.3
+requests>=2.32.3
+nvtx>=0.2.10
+transformers>=4.48.9
 cachetools>=5.4.0
 six
 tiktoken
@@ -12,7 +12,7 @@ shortuuid
 fastapi
 pydantic_settings
 uvicorn
-cmake==3.22.6
+cmake>=3.22.6
 modelscope
 aiohttp
 onnx