From 8e5bd2aa1d4b00b1e9c9109cb09cdca1ce71ac3a Mon Sep 17 00:00:00 2001 From: Jiejing Zhang Date: Fri, 23 May 2025 17:57:00 +0800 Subject: [PATCH] mm: add qwen vl2.5 model support. - add Qwen VL 2.5 model support. - Qwen VL 2.5 only supports 'transformers' as the ViT engine (TRT is not supported yet). - upgrade package versions to make sure the Qwen VL 2.5 code is available. test command: server: `dashinfer_vlm_serve --model qwen/Qwen2.5-VL-3B-Instruct --vision_engine transformers --port 8000 --host=127.0.0.1` client: ``` curl http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d \ '{"model": "qwen/Qwen2.5-VL-3B-Instruct", "messages": [{"role": "user", "content": [{ "type": "text", "text": "Describe the image." }, {"type": "image_url", "image_url": {"url": "https://farm4.staticflickr.com/3075/3168662394_7d7103de7d_z_d.jpg"}}]}], "max_completion_tokens": 1024, "top_p": 0.5, "temperature": 0.1, "frequency_penalty": 1.05 }' ``` result: ``` {"id":"chatcmpl-rxqDiCQEJweEeeB7FADiER","object":"chat.completion", "created":1747992522,"model":"model","choices":[{"index":0,"message":{"role":"assistant","content":"The image features a small hummingbird perched on a branch. The bird is positioned in the center of the scene, with its vibrant colors and delicate features clearly visible. The hummingbird appears to be enjoying its time in nature, possibly searching for food or simply resting on the branch. 
\n\nThere are no other birds or animals present in the image, making it a solitary moment captured in this natural setting."},"finish_reason":"stop"}],"usage":{"prompt_tokens":382,"total_tokens":95,"completion_tokens":81}} ``` --- .../vl_inference/utils/model_loader.py | 102 +++++++++++++----- multimodal/requirements.txt | 10 +- 2 files changed, 79 insertions(+), 33 deletions(-) diff --git a/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py b/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py index 8af2ed253..f6f6aaa41 100644 --- a/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py +++ b/multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py @@ -6,10 +6,14 @@ import torch import glob import warnings -from modelscope import snapshot_download -from transformers import Qwen2VLForConditionalGeneration, AutoConfig, AutoTokenizer -from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig from tqdm import tqdm + +from transformers import AutoConfig, AutoTokenizer, AutoProcessor + +from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration +from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig + from safetensors.torch import safe_open from dashinfer import allspark from dashinfer.allspark.model_loader import HuggingFaceModel, ModelSerializerException @@ -59,25 +63,58 @@ def load_model( # the open-source model can be loaded by huggingface try: if not os.path.isdir(self.hf_model_path): + from modelscope import snapshot_download self.hf_model_path = snapshot_download(self.hf_model_path) - self.torch_model = Qwen2VLForConditionalGeneration.from_pretrained( - self.hf_model_path, - trust_remote_code=self.trust_remote_code, - torch_dtype=dtype_to_torch_dtype(self.data_type), - device_map="cpu", - **kwargs, - ).eval() - self.vit_config = 
Qwen2VLVisionConfig.from_pretrained( - self.hf_model_path, - trust_remote_code=True, - revision=None, - code_revision=None, - ) - self.tokenizer = AutoTokenizer.from_pretrained( - self.hf_model_path, - trust_remote_code=self.trust_remote_code, - **kwargs, + + # Read config to determine model architecture + self.hf_model_config = AutoConfig.from_pretrained( + self.hf_model_path, trust_remote_code=self.trust_remote_code ) + + if hasattr(self.hf_model_config, "architectures") and "Qwen2_5_VLForConditionalGeneration" in self.hf_model_config.architectures: + self.torch_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + self.hf_model_path, + trust_remote_code=self.trust_remote_code, + torch_dtype=dtype_to_torch_dtype(self.data_type), + device_map="cpu", + **kwargs, + ).eval() + self.tokenizer = AutoTokenizer.from_pretrained( + self.hf_model_path, + trust_remote_code=self.trust_remote_code, + **kwargs, + ) + self.processor = AutoProcessor.from_pretrained( + self.hf_model_path, + trust_remote_code=self.trust_remote_code, + **kwargs, + ) + self.vit_config = Qwen2_5_VLVisionConfig.from_pretrained( + self.hf_model_path, + trust_remote_code=True, + revision=None, + code_revision=None, + ) + else: + self.torch_model = Qwen2VLForConditionalGeneration.from_pretrained( + self.hf_model_path, + trust_remote_code=self.trust_remote_code, + torch_dtype=dtype_to_torch_dtype(self.data_type), + device_map="cpu", + **kwargs, + ).eval() + self.tokenizer = AutoTokenizer.from_pretrained( + self.hf_model_path, + trust_remote_code=self.trust_remote_code, + **kwargs, + ) + self.vit_config = Qwen2VLVisionConfig.from_pretrained( + self.hf_model_path, + trust_remote_code=True, + revision=None, + code_revision=None, + ) + pass except Exception as e: print( f"exception when load model: {self.hf_model_path} , exception: {e}" @@ -102,10 +139,10 @@ def read_model_config(self): self.hf_model_config = AutoConfig.from_pretrained( self.hf_model_path, trust_remote_code=self.trust_remote_code ) - 
self.adapter = QWen2ConfigAdapter(self.hf_model_config) - self.as_model_config = self.adapter.model_config - if self.user_set_data_type is None: - self.data_type = self.adapter.get_model_data_type() + self.adapter = QWen2ConfigAdapter(self.hf_model_config) + self.as_model_config = self.adapter.model_config + if self.user_set_data_type is None: + self.data_type = self.adapter.get_model_data_type() return self def serialize( @@ -127,17 +164,26 @@ def serialize( onnx_trt_obj.export_onnx(onnxFile) onnx_trt_obj.generate_trt_engine(onnxFile, self.vision_model_path) elif self.vision_engine == "transformers": - visual_model = Qwen2VLForConditionalGeneration.from_pretrained( + if hasattr(self.hf_model_config, "architectures") and "Qwen2_5_VLForConditionalGeneration" in self.hf_model_config.architectures: + visual_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( self.hf_model_path, trust_remote_code=self.trust_remote_code, torch_dtype=dtype_to_torch_dtype(self.data_type), - device_map="cpu", - attn_implementation="flash_attention_2", + device_map="auto", + attn_implementation="sdpa", + ).visual.eval() + else: + visual_model = Qwen2VLForConditionalGeneration.from_pretrained( + self.hf_model_path, + trust_remote_code=self.trust_remote_code, + torch_dtype=dtype_to_torch_dtype(self.data_type), + device_map="auto", + attn_implementation="sdpa", ).visual.eval() self.vision_model_path = visual_model else: raise ValueError(f"unsupported engine {self.vision_engine}") - + # Convert Allspark LLM enable_quant = False weight_only_quant=False diff --git a/multimodal/requirements.txt b/multimodal/requirements.txt index 5e87ce66b..c96f0909a 100644 --- a/multimodal/requirements.txt +++ b/multimodal/requirements.txt @@ -1,9 +1,9 @@ dashinfer@https://github.com/modelscope/dash-infer/releases/download/v2.0.0-rc3/dashinfer-2.0.0rc3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl av -numpy==1.24.3 -requests==2.32.3 -nvtx==0.2.10 -transformers>=4.45.0 +numpy>=1.24.3 
+requests>=2.32.3 +nvtx>=0.2.10 +transformers>=4.48.9 cachetools>=5.4.0 six tiktoken @@ -12,7 +12,7 @@ shortuuid fastapi pydantic_settings uvicorn -cmake==3.22.6 +cmake>=3.22.6 modelscope aiohttp onnx