Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ Running verifier-guided inference requires only a few lines of code: just specif
**Set up target LLM server**
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-30B-A3B-Thinking-2507 \
--max-model-len 65536 \
--port 8000 \
--tensor-parallel-size 8
--model microsoft/Phi-4-reasoning \
--max-model-len 32768 \
--port 8001 \
--tensor-parallel-size 2
```

**Generate answer enabled with given monitors**
Expand Down
91 changes: 65 additions & 26 deletions examples/EarlyStopping/game24_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def init_llm_server(modelname, max_tokens=200, port=8000):
"top_k": 20,
"top_p": 0.95,
"min_p": 0.0,
"do_sample" : True,
"temperature": 0.6,
"stream": True,
"logprobs": 20,
Expand Down Expand Up @@ -113,10 +114,22 @@ def count_tokens(text, tokenizer):

def extract_solution(text):

# Only search for \boxed{} AFTER </think> to avoid grabbing unverified
# expressions from inside the thinking trace.
# If model opened <think> but never closed it (hit token limit), there is
# no final answer — return None.
if '</think>' in text:
search_text = text[text.rfind('</think>'):]
elif '<think>' in text:
# Model started thinking but never finished — no verified answer
return None
else:
search_text = text

# Use a more robust extraction that handles nested braces in \boxed{}
# Find \boxed{ and then match braces properly
boxed_pattern = r"\\boxed\{"
matches = list(re.finditer(boxed_pattern, text))
matches = list(re.finditer(boxed_pattern, search_text))
if not matches:
return None

Expand All @@ -125,14 +138,18 @@ def extract_solution(text):
start = last_match.end() # Position right after \boxed{
brace_count = 1
end = start
while end < len(text) and brace_count > 0:
if text[end] == '{':
while end < len(search_text) and brace_count > 0:
if search_text[end] == '{':
brace_count += 1
elif text[end] == '}':
elif search_text[end] == '}':
brace_count -= 1
end += 1

expr = text[start:end-1].strip() # -1 to exclude the closing brace
expr = search_text[start:end-1].strip() # -1 to exclude the closing brace

# Skip empty \boxed{} (e.g., from verifier feedback "Wrap in \boxed{}.")
if not expr:
return None

# 1. Convert \frac{a}{b} to (a/b)
frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
Expand All @@ -148,8 +165,16 @@ def extract_solution(text):
for latex, op in replacements.items():
expr = expr.replace(latex, op)

# 3. Cleanup (remove LaTeX spacing)
# 2b. Replace Unicode math operators (QwQ frequently uses these)
expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
expr = expr.replace('\u2013', '-').replace('\u2014', '-') # en-dash, em-dash

# 3. Cleanup (remove LaTeX formatting artifacts)
expr = expr.replace(r"\,", "").replace(r"\ ", "")
expr = expr.replace(r"\left", "").replace(r"\right", "")

# 3b. Strip trailing "= <number>" (e.g., "10 - 8/8 * 1 = 24" -> "10 - 8/8 * 1")
expr = re.sub(r'\s*=\s*[\d.]+\s*$', '', expr)

# 4. Handle implicit multiplication (e.g., "(11+1)(1+1)" -> "(11+1)*(1+1)")
# Insert * between: )( , )number, number(, )(
Expand Down Expand Up @@ -183,7 +208,6 @@ def evaluate_expression(expr, expected_nums=None):
except Exception:
return False


def evaluate_game24_answer(answer, nums):
"""
Evaluate a Game24 answer and return (is_correct, expr, error_message).
Expand Down Expand Up @@ -214,7 +238,7 @@ def evaluate_game24_answer(answer, nums):

parser = argparse.ArgumentParser(description="Game of 24 step-by-step solver with monitors")
parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--num_examples", "-n", type=int, default=1362, help="Number of examples to run")
parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
Expand Down Expand Up @@ -249,7 +273,7 @@ def evaluate_game24_answer(answer, nums):

dataset = load_game24_dataset()

llm_server = init_llm_server(main_model, max_tokens=32768)
llm_server = init_llm_server(main_model, max_tokens=32768, port=8000)

# Load tokenizer for accurate token counting
logger.info(f"Loading tokenizer for {main_model}...")
Expand All @@ -273,30 +297,45 @@ def evaluate_game24_answer(answer, nums):
if args.monitor:
# Use K-stable answer monitor to detect when equation stabilizes k times
# monitors = (SimpleTextReplaceMonitor("IsCheck", "</think>", async_execution=False),)
# monitors=(KstableAnswerGame24Monitor(
# name="game24_kstable",
# k=3,
# expected_nums=nums, # Validate equations use exactly these numbers
# answer_start_token="</think>"
# ),)
monitors = (
EATMonitor(
name="EAT_monitor",
model_name=earlystop_model,
alpha=0.2,
delta=0.02,
min_steps=4,
answer_start_token="</think>",
async_execution=True
),
)
monitors=(KstableAnswerGame24Monitor(
name="game24_kstable",
k=2,
expected_nums=nums, # Validate equations use exactly these numbers
answer_start_token="</think>"
),)
# monitors = (
# EATMonitor(
# name="EAT_monitor",
# model_name=earlystop_model,
# alpha=0.2,
# delta=0.02,
# min_steps=4,
# answer_start_token="</think>",
# async_execution=True
# ),
# )
else:
monitors = ()

logger.info(f"---- length of monitors {len(monitors)} ----")
logger.info(f"---- Example {idx+1} ----")
logger.info(f"Numbers: {nums}")

# system_prompt = (
# "You are Phi, a language model trained by Microsoft to help users. "
# "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
# "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
# "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
# "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
# "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
# "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
# "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
# "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
# "explorations, and reflections from the Thought section, systematically present the final solution that you "
# "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
# "to reach the conclusion. Now, try to solve the following question through the above guidelines."
# )

answer = asyncio.run(stream_completion(
f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
llm_server=llm_server,
Expand Down
83 changes: 33 additions & 50 deletions examples/EarlyStopping/maze_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
short_name = short_name.replace(" ", "_").replace(":", "-")
return short_name

def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/MazeResults"):
def get_output_dirs(main_model: str, base_dir: str = "../Outputs/MazeResults"):
"""Create and return output directory paths based on model name."""
model_short_name = get_model_short_name(main_model)
output_base = os.path.join(base_dir, model_short_name)
Expand All @@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/MazeResults"

return dirs

def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/MazeResults") -> str:
def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/MazeResults") -> str:
"""Generate log filename based on model name."""
model_short_name = get_model_short_name(main_model)
output_base = os.path.join(base_dir, model_short_name)
os.makedirs(output_base, exist_ok=True)
return os.path.join(output_base, f"EAT_{num_examples}examples.log")

def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/MazeResults") -> str:
def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/MazeResults") -> str:
"""Generate token CSV filename based on model name."""
model_short_name = get_model_short_name(main_model)
output_base = os.path.join(base_dir, model_short_name)
Expand All @@ -66,10 +66,10 @@ def remove_last_paragraph(s: str) -> str:
logger = logging.getLogger(__name__)

def load_maze_dataset(split="val"):
ds = load_dataset("microsoft/VISION_LANGUAGE", "maze", split=split)
ds = load_dataset("microsoft/VISION_LANGUAGE", "maze_text_only", split=split)
return ds

def init_llm_server(modelname, max_tokens=200, port=8000): #
def init_llm_server(modelname, max_tokens=200, port=8000):
url = f"http://localhost:{port}/v1/completions"
payload = {
"model": modelname,
Expand Down Expand Up @@ -101,19 +101,26 @@ def build_prompt_from_example(example): #(original prompt config)
return pre_prompt , description


def extract_solution(text):
matches = re.findall(r"\\boxed\{([^}]*)\}", text)
if not matches:
return None

expr = matches[-1].strip() # take last boxed content

# find one of A/B/C/D inside the boxed content
choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
if not choice_match:
return None

return choice_match.group(1).upper()
def extract_solution_mcq(text):
    """Pull a multiple-choice letter (A-D) out of raw model output.

    Several answer formats are tried in priority order (an explicit
    \\boxed{} wins over markdown bold, which wins over "answer: X",
    which wins over a bare letter on its own line). Within a format,
    the last occurrence in *text* is used. Returns the uppercase
    letter, or None when no recognizable choice is found.
    """
    # Candidate formats, most explicit first.
    candidate_patterns = (
        r"\\boxed\{([^}]*)\}",          # \boxed{...}
        r"boxed\{([^}]*)\}",            # boxed{...} with the backslash lost
        r"\*\*([A-D])\*\*",             # markdown bold, e.g. **A**
        r"answer[:\s]*([A-D])",         # "answer: A" / "Answer B" style
        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # letter standing alone on a line
    )

    for candidate in candidate_patterns:
        hits = re.findall(candidate, text, re.IGNORECASE)
        if not hits:
            continue
        # Take the final hit and confirm it really contains a choice letter.
        letter = re.search(r"\b([ABCD])\b", hits[-1].strip(), flags=re.IGNORECASE)
        if letter:
            return letter.group(1).upper()

    return None

def save_prompt(idx, prompt_with_answer, reason_dir):
filename = os.path.join(reason_dir, f"reason_{idx}.txt")
Expand All @@ -127,54 +134,30 @@ def count_tokens(text, tokenizer):
return len(tokens)


def evaluate_maze_answer(answer, options, ground_truth):
"""
Evaluate a Maze MCQ answer and return (is_correct, extracted_answer, message).

Args:
answer: Raw model output
options: Dictionary mapping option letters (A/B/C/D) to their values
ground_truth: The correct answer value

Returns:
Tuple of (is_correct, extracted_answer, message)
"""
sol = extract_solution(answer)
def evaluate_mcq_answer(answer, options, ground_truth):
sol = extract_solution_mcq(answer)
gt_sol = str(ground_truth).strip()

if not sol:
return False, None, "No expression found"

sol = sol.strip()

# Case 1: LLM returned option letter (A/B/C/D)
if sol in options:
if options[sol] == gt_sol:
return True, sol, f"Correct: option {sol} -> {options[sol]}"
else:
return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"

# Case 2: LLM returned the actual answer text
# First check if sol matches ground truth directly
return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
if sol.lower() == gt_sol.lower():
return True, sol, f"Correct: answer text matches ground truth: {sol}"

# Check if sol matches any option value
for opt_letter, opt_value in options.items():
if sol.lower() == opt_value.lower():
if opt_value == gt_sol:
return True, sol, f"Correct: answer text {sol} (option {opt_letter})"
else:
return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"

return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
return False, sol, f"Solution '{sol}' not found in options or ground truth"


if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Maze problem solver with LLM and monitors")
parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
parser.add_argument("--num_examples", "-n", type=int, default=1500, help="Number of examples to run")
parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
Expand Down Expand Up @@ -207,7 +190,7 @@ def evaluate_maze_answer(answer, options, ground_truth):

dataset = load_maze_dataset()

llm_server = init_llm_server(main_model, max_tokens=15000)
llm_server = init_llm_server(main_model, max_tokens=32768)

# Load tokenizer for accurate token counting
logger.info(f"Loading tokenizer for {main_model}...")
Expand All @@ -219,7 +202,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
total_generated_tokens = 0
generated_token_counts = []
total = len(dataset)
indices = np.linspace(3000, total-1, N, dtype=int).tolist()
indices = np.linspace(0, total-1, N, dtype=int).tolist()

for idx in indices:
example = dataset[idx]
Expand Down Expand Up @@ -268,7 +251,7 @@ def evaluate_maze_answer(answer, options, ground_truth):

# Evaluate the answer
gt_sol = str(example.get("ground_truth", "")).strip()
is_correct, extracted_answer, message = evaluate_maze_answer(answer, options, gt_sol)
is_correct, extracted_answer, message = evaluate_mcq_answer(answer, options, gt_sol)

if extracted_answer:
logger.info(f"Extracted answer: {extracted_answer}")
Expand Down
Loading
Loading