Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ Running verifier-guided inference requires only a few lines of code: just specif
**Set up target LLM server**
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-30B-A3B-Thinking-2507 \
--max-model-len 65536 \
--port 8000 \
--tensor-parallel-size 8
--model microsoft/Phi-4-reasoning \
--max-model-len 32768 \
--port 8001 \
--tensor-parallel-size 2
```

**Generate answer enabled with given monitors**
Expand Down
91 changes: 65 additions & 26 deletions examples/EarlyStopping/game24_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def init_llm_server(modelname, max_tokens=200, port=8000):
"top_k": 20,
"top_p": 0.95,
"min_p": 0.0,
"do_sample" : True,
"temperature": 0.6,
"stream": True,
"logprobs": 20,
Expand Down Expand Up @@ -113,10 +114,22 @@ def count_tokens(text, tokenizer):

def extract_solution(text):

# Only search for \boxed{} AFTER </think> to avoid grabbing unverified
# expressions from inside the thinking trace.
# If model opened <think> but never closed it (hit token limit), there is
# no final answer — return None.
if '</think>' in text:
search_text = text[text.rfind('</think>'):]
elif '<think>' in text:
# Model started thinking but never finished — no verified answer
return None
else:
search_text = text

# Use a more robust extraction that handles nested braces in \boxed{}
# Find \boxed{ and then match braces properly
boxed_pattern = r"\\boxed\{"
matches = list(re.finditer(boxed_pattern, text))
matches = list(re.finditer(boxed_pattern, search_text))
if not matches:
return None

Expand All @@ -125,14 +138,18 @@ def extract_solution(text):
start = last_match.end() # Position right after \boxed{
brace_count = 1
end = start
while end < len(text) and brace_count > 0:
if text[end] == '{':
while end < len(search_text) and brace_count > 0:
if search_text[end] == '{':
brace_count += 1
elif text[end] == '}':
elif search_text[end] == '}':
brace_count -= 1
end += 1

expr = text[start:end-1].strip() # -1 to exclude the closing brace
expr = search_text[start:end-1].strip() # -1 to exclude the closing brace

# Skip empty \boxed{} (e.g., from verifier feedback "Wrap in \boxed{}.")
if not expr:
return None

# 1. Convert \frac{a}{b} to (a/b)
frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
Expand All @@ -148,8 +165,16 @@ def extract_solution(text):
for latex, op in replacements.items():
expr = expr.replace(latex, op)

# 3. Cleanup (remove LaTeX spacing)
# 2b. Replace Unicode math operators (QwQ frequently uses these)
expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
expr = expr.replace('\u2013', '-').replace('\u2014', '-') # en-dash, em-dash

# 3. Cleanup (remove LaTeX formatting artifacts)
expr = expr.replace(r"\,", "").replace(r"\ ", "")
expr = expr.replace(r"\left", "").replace(r"\right", "")

# 3b. Strip trailing "= <number>" (e.g., "10 - 8/8 * 1 = 24" -> "10 - 8/8 * 1")
expr = re.sub(r'\s*=\s*[\d.]+\s*$', '', expr)

# 4. Handle implicit multiplication (e.g., "(11+1)(1+1)" -> "(11+1)*(1+1)")
# Insert * between: )( , )number, number(, )(
Expand Down Expand Up @@ -183,7 +208,6 @@ def evaluate_expression(expr, expected_nums=None):
except Exception:
return False


def evaluate_game24_answer(answer, nums):
"""
Evaluate a Game24 answer and return (is_correct, expr, error_message).
Expand Down Expand Up @@ -214,7 +238,7 @@ def evaluate_game24_answer(answer, nums):

parser = argparse.ArgumentParser(description="Game of 24 step-by-step solver with monitors")
parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--num_examples", "-n", type=int, default=1362, help="Number of examples to run")
parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
Expand Down Expand Up @@ -249,7 +273,7 @@ def evaluate_game24_answer(answer, nums):

dataset = load_game24_dataset()

llm_server = init_llm_server(main_model, max_tokens=32768)
llm_server = init_llm_server(main_model, max_tokens=32768, port=8000)

# Load tokenizer for accurate token counting
logger.info(f"Loading tokenizer for {main_model}...")
Expand All @@ -273,30 +297,45 @@ def evaluate_game24_answer(answer, nums):
if args.monitor:
# Use K-stable answer monitor to detect when equation stabilizes k times
# monitors = (SimpleTextReplaceMonitor("IsCheck", "</think>", async_execution=False),)
# monitors=(KstableAnswerGame24Monitor(
# name="game24_kstable",
# k=3,
# expected_nums=nums, # Validate equations use exactly these numbers
# answer_start_token="</think>"
# ),)
monitors = (
EATMonitor(
name="EAT_monitor",
model_name=earlystop_model,
alpha=0.2,
delta=0.02,
min_steps=4,
answer_start_token="</think>",
async_execution=True
),
)
monitors=(KstableAnswerGame24Monitor(
name="game24_kstable",
k=2,
expected_nums=nums, # Validate equations use exactly these numbers
answer_start_token="</think>"
),)
# monitors = (
# EATMonitor(
# name="EAT_monitor",
# model_name=earlystop_model,
# alpha=0.2,
# delta=0.02,
# min_steps=4,
# answer_start_token="</think>",
# async_execution=True
# ),
# )
else:
monitors = ()

logger.info(f"---- length of monitors {len(monitors)} ----")
logger.info(f"---- Example {idx+1} ----")
logger.info(f"Numbers: {nums}")

# system_prompt = (
# "You are Phi, a language model trained by Microsoft to help users. "
# "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
# "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
# "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
# "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
# "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
# "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
# "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
# "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
# "explorations, and reflections from the Thought section, systematically present the final solution that you "
# "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
# "to reach the conclusion. Now, try to solve the following question through the above guidelines."
# )

answer = asyncio.run(stream_completion(
f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
llm_server=llm_server,
Expand Down
83 changes: 33 additions & 50 deletions examples/EarlyStopping/maze_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
short_name = short_name.replace(" ", "_").replace(":", "-")
return short_name

def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/MazeResults"):
def get_output_dirs(main_model: str, base_dir: str = "../Outputs/MazeResults"):
"""Create and return output directory paths based on model name."""
model_short_name = get_model_short_name(main_model)
output_base = os.path.join(base_dir, model_short_name)
Expand All @@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/MazeResults"

return dirs

def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/MazeResults") -> str:
def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/MazeResults") -> str:
"""Generate log filename based on model name."""
model_short_name = get_model_short_name(main_model)
output_base = os.path.join(base_dir, model_short_name)
os.makedirs(output_base, exist_ok=True)
return os.path.join(output_base, f"EAT_{num_examples}examples.log")

def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/MazeResults") -> str:
def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/MazeResults") -> str:
"""Generate token CSV filename based on model name."""
model_short_name = get_model_short_name(main_model)
output_base = os.path.join(base_dir, model_short_name)
Expand All @@ -66,10 +66,10 @@ def remove_last_paragraph(s: str) -> str:
logger = logging.getLogger(__name__)

def load_maze_dataset(split="val"):
ds = load_dataset("microsoft/VISION_LANGUAGE", "maze", split=split)
ds = load_dataset("microsoft/VISION_LANGUAGE", "maze_text_only", split=split)
return ds

def init_llm_server(modelname, max_tokens=200, port=8000): #
def init_llm_server(modelname, max_tokens=200, port=8000):
url = f"http://localhost:{port}/v1/completions"
payload = {
"model": modelname,
Expand Down Expand Up @@ -101,19 +101,26 @@ def build_prompt_from_example(example): #(original prompt config)
return pre_prompt , description


def extract_solution(text):
matches = re.findall(r"\\boxed\{([^}]*)\}", text)
if not matches:
return None

expr = matches[-1].strip() # take last boxed content

# find one of A/B/C/D inside the boxed content
choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
if not choice_match:
return None

return choice_match.group(1).upper()
def extract_solution_mcq(text):
    """Pull a multiple-choice letter (A-D) out of raw model output.

    Several answer formats are tried in priority order (an explicit
    \\boxed{} wins over markdown bold, which wins over "answer: X",
    which wins over a bare letter on its own line). Within a format,
    the last occurrence in *text* is used. Returns the uppercase
    letter, or None when no recognizable choice is found.
    """
    # Candidate formats, most explicit first.
    candidate_patterns = (
        r"\\boxed\{([^}]*)\}",          # \boxed{...}
        r"boxed\{([^}]*)\}",            # boxed{...} with the backslash lost
        r"\*\*([A-D])\*\*",             # markdown bold, e.g. **A**
        r"answer[:\s]*([A-D])",         # "answer: A" / "Answer B" style
        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # letter standing alone on a line
    )

    for candidate in candidate_patterns:
        hits = re.findall(candidate, text, re.IGNORECASE)
        if not hits:
            continue
        # Take the final hit and confirm it really contains a choice letter.
        letter = re.search(r"\b([ABCD])\b", hits[-1].strip(), flags=re.IGNORECASE)
        if letter:
            return letter.group(1).upper()

    return None

def save_prompt(idx, prompt_with_answer, reason_dir):
filename = os.path.join(reason_dir, f"reason_{idx}.txt")
Expand All @@ -127,54 +134,30 @@ def count_tokens(text, tokenizer):
return len(tokens)


def evaluate_maze_answer(answer, options, ground_truth):
"""
Evaluate a Maze MCQ answer and return (is_correct, extracted_answer, message).

Args:
answer: Raw model output
options: Dictionary mapping option letters (A/B/C/D) to their values
ground_truth: The correct answer value

Returns:
Tuple of (is_correct, extracted_answer, message)
"""
sol = extract_solution(answer)
def evaluate_mcq_answer(answer, options, ground_truth):
sol = extract_solution_mcq(answer)
gt_sol = str(ground_truth).strip()

if not sol:
return False, None, "No expression found"

sol = sol.strip()

# Case 1: LLM returned option letter (A/B/C/D)
if sol in options:
if options[sol] == gt_sol:
return True, sol, f"Correct: option {sol} -> {options[sol]}"
else:
return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"

# Case 2: LLM returned the actual answer text
# First check if sol matches ground truth directly
return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
if sol.lower() == gt_sol.lower():
return True, sol, f"Correct: answer text matches ground truth: {sol}"

# Check if sol matches any option value
for opt_letter, opt_value in options.items():
if sol.lower() == opt_value.lower():
if opt_value == gt_sol:
return True, sol, f"Correct: answer text {sol} (option {opt_letter})"
else:
return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"

return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
return False, sol, f"Solution '{sol}' not found in options or ground truth"


if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Maze problem solver with LLM and monitors")
parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--num_examples", "-n", type=int, default=1500, help="Number of examples to run")
parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
Expand Down Expand Up @@ -207,7 +190,7 @@ def evaluate_maze_answer(answer, options, ground_truth):

dataset = load_maze_dataset()

llm_server = init_llm_server(main_model, max_tokens=15000)
llm_server = init_llm_server(main_model, max_tokens=32768)

# Load tokenizer for accurate token counting
logger.info(f"Loading tokenizer for {main_model}...")
Expand All @@ -219,7 +202,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
total_generated_tokens = 0
generated_token_counts = []
total = len(dataset)
indices = np.linspace(3000, total-1, N, dtype=int).tolist()
indices = np.linspace(0, total-1, N, dtype=int).tolist()

for idx in indices:
example = dataset[idx]
Expand Down Expand Up @@ -268,7 +251,7 @@ def evaluate_maze_answer(answer, options, ground_truth):

# Evaluate the answer
gt_sol = str(example.get("ground_truth", "")).strip()
is_correct, extracted_answer, message = evaluate_maze_answer(answer, options, gt_sol)
is_correct, extracted_answer, message = evaluate_mcq_answer(answer, options, gt_sol)

if extracted_answer:
logger.info(f"Extracted answer: {extracted_answer}")
Expand Down
Loading
Loading