-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathengine_manager.py
More file actions
127 lines (115 loc) · 5.05 KB
/
engine_manager.py
File metadata and controls
127 lines (115 loc) · 5.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
"""
Engine management utilities for vLLM Engine-Direct Connection Ceiling Benchmark
"""
import sys
from vllm import SamplingParams
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from config import BenchmarkConfig
def create_engine(config: BenchmarkConfig) -> AsyncLLMEngine:
    """
    Create and initialize an AsyncLLMEngine from the benchmark configuration.

    Optional batching knobs (``max_num_seqs``, ``max_num_batched_tokens``) are
    only forwarded when the config sets them, so vLLM's own defaults apply
    otherwise.

    Args:
        config: Benchmark settings describing the model and engine parameters.

    Returns:
        A fully constructed AsyncLLMEngine.

    Raises:
        Exception: Re-raises whatever the engine constructor raised, after
            printing diagnostic guidance to stderr when the failure looks like
            the known multimodal processor type-mismatch issue.
    """
    engine_kwargs = {
        "model": config.model,
        "dtype": config.dtype,
        "tensor_parallel_size": config.tensor_parallel_size,
        "gpu_memory_utilization": config.gpu_memory_utilization,
        "max_model_len": config.max_model_len,
        "swap_space": config.swap_space,
        "enforce_eager": config.enforce_eager,
        "trust_remote_code": config.trust_remote_code,
    }
    if config.max_num_seqs is not None:
        engine_kwargs["max_num_seqs"] = config.max_num_seqs
    if config.max_num_batched_tokens is not None:
        engine_kwargs["max_num_batched_tokens"] = config.max_num_batched_tokens
    try:
        engine_args = AsyncEngineArgs(**engine_kwargs)
        return AsyncLLMEngine.from_engine_args(engine_args)
    # FIX: the original caught (TypeError, RuntimeError, Exception) --
    # Exception already subsumes the other two, so the tuple was redundant.
    except Exception as e:
        import traceback

        # The processor error can surface in the message, the repr, or only in
        # the formatted traceback, so search all three at once. (The original
        # checked each marker against an inconsistent subset of the three.)
        haystack = "".join(
            (
                str(e),
                repr(e),
                "".join(traceback.format_exception(type(e), e, e.__traceback__)),
            )
        )
        is_processor_error = any(
            marker in haystack
            for marker in (
                "ProcessorMixin",
                "PreTrainedTokenizerFast",
                "Invalid type of HuggingFace processor",
            )
        )
        # Engine-init failures often wrap the processor error in worker logs.
        is_engine_init_failure = (
            "Engine core initialization failed" in haystack
            or "WorkerProc initialization failed" in haystack
        )
        # FIX: '"GLM" in model or "glm" in model.lower()' -- the first test
        # was redundant because the lowercased check already covers it.
        # For GLM-4.6V specifically, this is almost certainly the processor issue.
        is_glm_model = "glm" in config.model.lower()
        if is_processor_error or (is_engine_init_failure and is_glm_model):
            # One joined write instead of ~20 separate prints; output text is
            # unchanged.
            lines = [
                "",
                "=" * 80,
                "ERROR: vLLM processor type mismatch detected",
                "=" * 80,
                "",
                f"The model '{config.model}' appears to be a multimodal model,",
                "but vLLM is encountering a processor type mismatch during initialization.",
                "",
                "This is a known compatibility issue with some multimodal models in vLLM v1.",
                "The error occurs when vLLM tries to load a processor for multimodal profiling,",
                "but receives a tokenizer instead of the expected ProcessorMixin.",
            ]
            if is_engine_init_failure and not is_processor_error:
                lines += [
                    "",
                    "Note: The actual processor error may be visible in the worker process logs above.",
                ]
            lines += [
                "",
                "Possible solutions:",
                "1. Check if the model is compatible with your vLLM version",
                "2. Try updating vLLM to the latest version:",
                "   pip install --upgrade vllm",
                "3. Use a different model that is known to work with vLLM",
                "4. Check vLLM GitHub issues for this specific model:",
                f"   https://github.com/vllm-project/vllm/issues?q={config.model.replace('/', '%2F')}",
                "",
                "Original error:",
                "-" * 80,
            ]
            print("\n".join(lines), file=sys.stderr)
        raise
def create_sampling_params(config: BenchmarkConfig) -> SamplingParams:
    """
    Build the SamplingParams shared by every benchmark request.

    Output length is capped by ``config.max_new_tokens``; the sampling knobs
    below trade determinism for variety across requests.
    """
    # Moderate temperature adds randomness; nucleus (top_p) and top_k
    # sampling keep generations coherent while still diverse.
    params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        max_tokens=config.max_new_tokens,
    )
    return params