-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathengine_manager.py
More file actions
127 lines (115 loc) · 5.05 KB
/
engine_manager.py
File metadata and controls
127 lines (115 loc) · 5.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
"""
Engine management utilities for vLLM Engine-Direct Connection Ceiling Benchmark
"""
import sys
from vllm import SamplingParams
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from config import BenchmarkConfig
def create_engine(config: BenchmarkConfig) -> AsyncLLMEngine:
    """
    Create and initialize an AsyncLLMEngine from the benchmark configuration.

    Optional batching knobs (``max_num_seqs``, ``max_num_batched_tokens``) are
    only forwarded when the config sets them, so vLLM's own defaults apply
    otherwise.

    Args:
        config: Benchmark settings describing the model and engine parameters.

    Returns:
        A fully constructed AsyncLLMEngine.

    Raises:
        Exception: Re-raises whatever the engine constructor raised, after
            printing diagnostic guidance to stderr when the failure looks like
            the known multimodal processor type-mismatch issue.
    """
    engine_kwargs = {
        "model": config.model,
        "dtype": config.dtype,
        "tensor_parallel_size": config.tensor_parallel_size,
        "gpu_memory_utilization": config.gpu_memory_utilization,
        "max_model_len": config.max_model_len,
        "swap_space": config.swap_space,
        "enforce_eager": config.enforce_eager,
        "trust_remote_code": config.trust_remote_code,
    }
    if config.max_num_seqs is not None:
        engine_kwargs["max_num_seqs"] = config.max_num_seqs
    if config.max_num_batched_tokens is not None:
        engine_kwargs["max_num_batched_tokens"] = config.max_num_batched_tokens
    try:
        engine_args = AsyncEngineArgs(**engine_kwargs)
        return AsyncLLMEngine.from_engine_args(engine_args)
    # FIX: the original caught (TypeError, RuntimeError, Exception) --
    # Exception already subsumes the other two, so the tuple was redundant.
    except Exception as e:
        import traceback

        # The processor error can surface in the message, the repr, or only in
        # the formatted traceback, so search all three at once. (The original
        # checked each marker against an inconsistent subset of the three.)
        haystack = "".join(
            (
                str(e),
                repr(e),
                "".join(traceback.format_exception(type(e), e, e.__traceback__)),
            )
        )
        is_processor_error = any(
            marker in haystack
            for marker in (
                "ProcessorMixin",
                "PreTrainedTokenizerFast",
                "Invalid type of HuggingFace processor",
            )
        )
        # Engine-init failures often wrap the processor error in worker logs.
        is_engine_init_failure = (
            "Engine core initialization failed" in haystack
            or "WorkerProc initialization failed" in haystack
        )
        # FIX: '"GLM" in model or "glm" in model.lower()' -- the first test
        # was redundant because the lowercased check already covers it.
        # For GLM-4.6V specifically, this is almost certainly the processor issue.
        is_glm_model = "glm" in config.model.lower()
        if is_processor_error or (is_engine_init_failure and is_glm_model):
            # One joined write instead of ~20 separate prints; output text is
            # unchanged.
            lines = [
                "",
                "=" * 80,
                "ERROR: vLLM processor type mismatch detected",
                "=" * 80,
                "",
                f"The model '{config.model}' appears to be a multimodal model,",
                "but vLLM is encountering a processor type mismatch during initialization.",
                "",
                "This is a known compatibility issue with some multimodal models in vLLM v1.",
                "The error occurs when vLLM tries to load a processor for multimodal profiling,",
                "but receives a tokenizer instead of the expected ProcessorMixin.",
            ]
            if is_engine_init_failure and not is_processor_error:
                lines += [
                    "",
                    "Note: The actual processor error may be visible in the worker process logs above.",
                ]
            lines += [
                "",
                "Possible solutions:",
                "1. Check if the model is compatible with your vLLM version",
                "2. Try updating vLLM to the latest version:",
                "   pip install --upgrade vllm",
                "3. Use a different model that is known to work with vLLM",
                "4. Check vLLM GitHub issues for this specific model:",
                f"   https://github.com/vllm-project/vllm/issues?q={config.model.replace('/', '%2F')}",
                "",
                "Original error:",
                "-" * 80,
            ]
            print("\n".join(lines), file=sys.stderr)
        raise
def create_sampling_params(config: BenchmarkConfig) -> SamplingParams:
    """
    Build the SamplingParams shared by every benchmark request.

    Output length is capped by ``config.max_new_tokens``; the sampling knobs
    below trade determinism for variety across requests.
    """
    # Moderate temperature adds randomness; nucleus (top_p) and top_k
    # sampling keep generations coherent while still diverse.
    params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        max_tokens=config.max_new_tokens,
    )
    return params